diff --git a/.gitignore b/.gitignore index a2009a1ed30a1c..801790d0a47208 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* @@ -54,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7314..9f6fd32ad986c4 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45a76fdc1f1a2a..cfbe68eecbaca5 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de162d5..1291e60cfe4ce1 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. 
if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379ada..ebb686d8ad0f31 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. 
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. 
Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 96bc4a710f8c1c..f88c993d85e2fa 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 00000000000000..09789bd4d37863 --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class NPUEventManager { + public: + NPUEventManager() = default; + + ~NPUEventManager() { + if (is_created_) { + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventDestroy(event_); + } + } + + NPUEventManager(const NPUEventManager&) = delete; + NPUEventManager& operator=(const NPUEventManager&) = delete; + + NPUEventManager(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + NPUEventManager& operator=(NPUEventManager&& other) { + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + aclrtEvent GetRawNPUEvent() const { return event_; } + + void Record(const paddle::platform::NPUDeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "NPUDeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::NPUDeviceGuard guard(device_index_); + platform::NPUEventRecord(event_, ctx.stream()); + } + + bool Query() const { + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(event_, &status); + if (status == ACL_EVENT_STATUS_COMPLETE) { + return true; + } + return false; + } + + void Block(const paddle::platform::NPUDeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::NPUDeviceGuard guard(device_index_); + platform::NPUStreamWaitEvent(ctx.stream(), event_); + } + } + + private: + bool is_created_{false}; + aclrtEvent event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::NPUDeviceGuard guard(device_index); + platform::NPUEventCreate(&event_); + is_created_ = true; + } +}; + +class HCCLCommManager { + public: + explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {} + + HCCLCommManager() : HCCLCommManager(nullptr) {} + + ~HCCLCommManager() noexcept { + 
std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
+                                                       &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+  }
+
+ protected:
+  HcclComm hccl_comm_;
+  HcclRootInfo* hccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
new file mode 100644
index 00000000000000..84f5ca48d25c84
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -0,0 +1,356 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+
+DECLARE_bool(hccl_blocking_wait);
+// DECLARE_bool(use_stream_safe_npu_allocator);
+
+constexpr int64_t kWaitBlockTImeout = 10;
+
+namespace paddle {
+namespace distributed {
+
+static HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, HcclReduceOp> red_type = {
+      {ReduceOp::MIN, HCCL_REDUCE_MIN},
+      {ReduceOp::MAX, HCCL_REDUCE_MAX},
+      {ReduceOp::SUM, HCCL_REDUCE_SUM},
+      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(
+      it != red_type.end(), true,
+      platform::errors::InvalidArgument("Invalid hccl reduction. "
+                                        "Must be Min | Max | Prod | Sum"));
+  return it->second;
+}
+
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
+  std::ostringstream oss;
+  for (size_t i = 0; i < sizeof(hcclID); ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+
+// Get the list of devices from list of tensors
+std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
+  std::vector<Place> places;
+  places.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    places.push_back(tensor.inner_place());
+  }
+  return places;
+}
+
+// Get the deviceList String from the list of devices
+std::string GetKeyFromPlaces(const std::vector<Place>& places) {
+  std::string placeList;
+  for (auto& place : places) {
+    std::stringstream tmp;
+    tmp << place;
+    if (placeList.empty()) {
+      placeList += tmp.str();
+    } else {
+      placeList += "," + tmp.str();
+    }
+  }
+  return placeList;
+}
+
+// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
+//   return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
+//     return t.place() == platform::DeviceType::NPU;
+//   });
+// }
+
+void SyncDefaultStream(
+    const std::vector<Place>& places,
+    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
+    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
+  for (size_t i = 0; i < places.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places[i]));
+    hcclEvents[i].Record(*dev_ctx[i]);
+    hcclEvents[i].Block(*default_ctx);
+  }
+}
+
+std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
+    std::vector<Place> places, int rank, CommType comm_type,
+    const std::vector<Tensor>& inputs) {
+  return std::make_shared<ProcessGroupHCCL::HCCLTask>(places, rank, comm_type,
+                                                      inputs);
+}
+
+ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector<Place>& places, int rank,
+                                     CommType CommType,
+                                     const std::vector<Tensor>& inputs)
+    : Task(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  hcclComms_.resize(places.size());
+}
+
+ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
+
+void ProcessGroupHCCL::HCCLTask::SetOutputs(
+    std::vector<Tensor>& outputs) {  // NOLINT
+  outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
+}
+
+void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places_[i]));
+    platform::NPUStreamWaitEvent(default_ctx->stream(),
+                                 control_events_[i].GetRawNPUEvent());
+  }
+}
+
+bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    if (!control_events_[i].Query()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// TODO(sandyhouse): Add timeout for wait, now timeout unused
+bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
+  SynchronizeStreams();
+  if (FLAGS_hccl_blocking_wait) {
+    // NOTE(sandyhouse): It will block host for sync
+    while (!IsCompleted()) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
+    }
+  }
+  return true;
+}
+
+// Same as Wait
+void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
+
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                                   int rank, int size)
+    : ProcessGroup(rank, size), store_(store) {}
+
+void ProcessGroupHCCL::BroadcastUniqueHCCLID(
+    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
+  if (rank_ == 0) {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto hccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo)); +
store_->set(key, hccl_id); + } + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + 
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 00000000000000..f2376b4eed7600 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 88d8fb69eb6980..67715f410d443c 100644 --- 
a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -156,36 +156,27 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -213,8 +204,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index d63a5e768382c6..aa2a2b8fa2088c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -75,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -118,7 +119,7 @@ class ProcessGroupNCCL : public ProcessGroup { const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 2581a74d7e8187..7b4ae7e70ff6f0 100644 --- 
a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,15 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; - virtual void set(const std::string& key, - const std::vector& value) = 0; + virtual int64_t add(const std::string& key, int64_t value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 8cb69caf663696..698a698fc6d184 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 2fc846cccc22e8..dc79a8a45a2467 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', 
'_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -2349,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index c6bca01205e19c..53af6c1048d245 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") +set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d1e208541537c8..4f6f437163a8dc 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,12 +23,13 @@ core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" yaml_types_mapping = { - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ - 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', 'Tensor[Tensor[]]' : 'std::vector>', @@ -125,6 +126,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -133,9 +135,13 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = 
content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret @@ -608,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -671,7 +684,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -850,7 +863,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = len(forward_outputs_position_map.keys()) - len( @@ -1000,7 +1017,9 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/include/sparse_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1024,6 +1043,7 @@ def GenerateForwardCCFile(filepath, forward_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" """ @@ -1042,6 +1062,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() @@ -1053,134 +1074,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = 
args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - 
BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + if 'backward' not in fwd_api.keys(): + continue + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, 
fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) + print("Generated Node Declaration: ", node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += 
yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d0506e45eb476c..abf3f86bdb03b8 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap skipped_fwd_api_names = set(["scale"]) @@ -126,16 +126,20 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = f"{namespace}::" + if is_forward_only: - fwd_function_name = fwd_api_name + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name else: - fwd_function_name = GetForwardFunctionName(fwd_api_name) + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -189,7 +193,7 @@ def GenerateCoreOpsInfoMap(): """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -222,6 +226,7 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -254,57 +259,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if 
fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + python_c_function_reg_list = [] + for fwd_api in fwd_api_list: + + # We only generate Ops with grad + is_forward_only = False + if 'backward' not in fwd_api.keys(): + is_forward_only = True + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + if fwd_api_name in skipped_fwd_api_names: + continue + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", forward_inputs_list) + print("Prased Original Forward Attrs List: ", forward_attrs_list) + print("Parsed Original Forward Returns List: ", + forward_returns_list) + + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( + fwd_api_name, forward_inputs_position_map, forward_attrs_list, + forward_outputs_position_map, optional_inputs, is_forward_only) + python_c_function_list.append(python_c_function_str) + python_c_function_reg_list.append(python_c_function_reg_str) + print("Generated Python-C Function: ", python_c_function_str) + + # Append Namespace + python_c_functions_reg_str += 
",\n".join( + python_c_function_reg_list) + "," + python_c_functions = "\n".join(python_c_function_list) + if len(namespace) > 0: + python_c_functions_str += f"""namespace {namespace} {{ + {python_c_functions} +}} +""" + + else: + python_c_functions_str += python_c_functions python_c_str = GeneratePythonCWrappers(python_c_functions_str, python_c_functions_reg_str) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 00000000000000..6f8bccd64e45f0 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + // Set Attributes + grad_node->SetAttrMap(attrs); + // Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff 
--git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 00000000000000..ae5d86664a346f --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,468 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + // global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). 
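+    // Illustrative note (hypothetical tensor name): a forward tensor named
+    // "linear_0.tmp_1" yields a grad tensor that holds the same impl and is
+    // renamed "linear_0.tmp_1@GRAD"; RunProgramGradAPI later looks up exactly
+    // that name in the internal scope when sharing the gradients back out.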
+ VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + grad_tensors->emplace_back(fwd_t.impl()); + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tensor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e486799495c7ab..aa92a3b2226c1f 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c9..948eaab40b4f64 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8f..fdf74d2f769fcd 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary.
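For reference, a minimal sketch of how the new fuse_gemm_epilogue_ switch is meant to be flipped on the C++ side (illustrative only; it assumes linking against the Paddle framework libraries, and the ParallelExecutor plumbing that actually consumes the strategy is omitted):

#include "paddle/fluid/framework/details/build_strategy.h"

int main() {
  paddle::framework::details::BuildStrategy build_strategy;
  // Opt in to the cuBLASLt GEMM+epilogue fusion; the flag defaults to false
  // and the pass is only appended when Paddle is built with CUDA >= 11.6.
  build_strategy.fuse_gemm_epilogue_ = true;
  return 0;
}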
@@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3b..5eb584aaefa981 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. It is null in default, means diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 7232a707916dd5..91ef59575c3aa2 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -232,16 +232,8 @@ class CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -254,6 +246,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583ffe..b692b6ffab0801 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcbae88..2eeefb19a1aa8c 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, 
+DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3f2da8cbeb3adb..623c8a048c2417 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -158,6 +158,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 00000000000000..f48224cbdc24fe --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
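+// High-level sketch of the rewrite this pass performs (tensor names are
+// illustrative only):
+//
+//   before:  tmp     = matmul_v2(X, W)
+//            out     = elementwise_add(tmp, Bias)
+//            act_out = relu(out)            // or gelu, or no activation
+//
+//   after:   act_out = fused_gemm_epilogue(X, W, Bias, activation="relu")
+//
+// When training, the fused op additionally emits a ReserveSpace output so the
+// matching backward pattern can be rewritten into fused_gemm_epilogue_grad.
+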
+ +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + 
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 00000000000000..575ffee73d60e9 --- /dev/null +++ 
b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 72f0f790043dbc..18068e22b7f3c3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1491,31 +1491,6 @@ PDNode 
*patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1556,6 +1531,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + 
act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 654b47fc97963e..062d2f9dedce65 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -885,6 +885,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns 
are used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0..d578ada0db00fe 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46f..a69cc0d6b866d0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); 
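For orientation, the LinearAct pattern declared a few hunks above matches the matmul_v2 + elementwise_add + act subgraph that paddle.nn.Linear followed by ReLU/GELU produces, and the pass rewrites that subgraph into a single fused_gemm_epilogue op. Below is a small plain-C++ reference for what the matched subgraph computes, under the pattern's own constraints (W is 2-D, trans_x/trans_y are false); LinearRelu is an illustrative helper written for this note, not part of the patch or of the fused kernel's actual implementation.

#include <cstddef>
#include <vector>

// Reference semantics of the subgraph the LinearAct pattern matches:
// Out = act(X * W + bias), with X of shape [m, k], W [k, n], bias [n].
// Plain CPU loops for illustration only; the pass replaces this subgraph
// with one fused_gemm_epilogue op instead of computing it this way.
std::vector<float> LinearRelu(const std::vector<float>& x,
                              const std::vector<float>& w,
                              const std::vector<float>& bias,
                              std::size_t m, std::size_t k, std::size_t n) {
  std::vector<float> out(m * n, 0.f);
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      float acc = bias[j];                    // elementwise_add with Y = bias
      for (std::size_t p = 0; p < k; ++p) {
        acc += x[i * k + p] * w[p * n + j];   // matmul_v2, trans_x/trans_y false
      }
      out[i * n + j] = acc > 0.f ? acc : 0.f; // act = relu
    }
  }
  return out;
}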
USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b710c..eb40a49b4066a7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eff6d9a9102d2b..f8e30c1ee294ec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -2189,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2212,7 +2271,11 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of 
Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33d4feb82a9e7..1a1171f1dba4d7 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f0..47dffd47b7cbbf 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97..cdccc4c5546900 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index e1ce705533ab4b..3d8a5ab21f00fc 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,7 +42,7 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b004691..fbc47f81fd3316 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && 
it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5e..bae49fb381a475 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 30dbe07d7afca6..d7c0c8cc547e6b 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -332,6 +332,7 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -409,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -432,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if 
(attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af21ae..fec9afbf3b403c 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + 
PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c279..9fac4b41cbde01 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3baeed..09de0106ed6190 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc 
b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21fde..5e674af1a08a87 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3d..17cbe06748234a 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -226,7 +226,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4336a5c77c178f..01c9d2847e0c85 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143ca8..fd13fce6a6e17a 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6eeb5d64253597..1f83e606c3fded 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cd..b69292827aa136 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c8..7f7313fbcb5969 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb07..cf377396087637 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace 
paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b421e..336005d883b0f5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e9..e77be832c0cc89 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd43aa..e1460629fb18a4 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada73..b4a97e24cf2923 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct 
CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0a..66f1bcc8b68692 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 
+1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6b..4b79397b6cdf2e 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
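The hunks above replace the old unscoped kDepX/kDepOut/kNoDeps constants with the scoped phi::funcs::ActBwdOpFwdDeps enum, so every dependency test now spells out the enum class and goes through an explicit int cast before the bitwise check. A minimal standalone sketch of that flag scheme follows; DemoReluGradFunctor and DescribeRequiredForwardTensors are illustrative names invented for this note, not Paddle code.

#include <cstdio>

// Simplified stand-in for phi::funcs::ActBwdOpFwdDeps with the same values
// as the removed fluid enum (kNoDeps = 0x00, kDepX = 0x01, kDepOut = 0x02).
enum class ActBwdOpFwdDeps : int {
  kNoDeps = 0x00,  // backward needs neither forward X nor Out
  kDepX = 0x01,    // backward needs forward input X
  kDepOut = 0x02,  // backward needs forward output Out
};

// Hypothetical grad functor: relu'(x) only needs Out, so it reports kDepOut,
// mirroring ReluGradFunctor::FwdDeps() in the diff.
struct DemoReluGradFunctor {
  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
};

// Mirrors the checks in ActivationOpDoubleGrad / ExtractActivationGradTensor:
// the flag is tested bit-wise to decide which forward tensors to fetch.
template <typename GradFunctor>
void DescribeRequiredForwardTensors() {
  const int deps = static_cast<int>(GradFunctor::FwdDeps());
  if (deps & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
    std::printf("needs forward input X\n");
  }
  if (deps & static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
    std::printf("needs forward output Out\n");
  }
  if (deps == static_cast<int>(ActBwdOpFwdDeps::kNoDeps)) {
    std::printf("needs no forward tensors\n");
  }
}

int main() {
  DescribeRequiredForwardTensors<DemoReluGradFunctor>();  // "needs forward output Out"
  return 0;
}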
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index e1afb3919f813b..92a101451e211f 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : 
public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,86 +244,9 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); - } -}; - -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout 
= static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + 
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1509,7 +1226,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1252,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, @@ -1594,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - 
ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1650,7 +1331,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,7 +1342,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1696,7 +1382,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1414,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h index f7aa0de97598df..56aebe90788fba 100644 --- a/paddle/fluid/operators/amp/fp16_type_traits.h +++ b/paddle/fluid/operators/amp/fp16_type_traits.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +33,12 @@ class MPTypeTrait { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c7..c5e4188ca2d6f7 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu deleted file mode 100644 index 14708c4df10f51..00000000000000 --- a/paddle/fluid/operators/arg_max_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" - -REGISTER_OP_CUDA_KERNEL( - arg_max, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d..00000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) 
\ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d3..585341beea12c1 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfc..fb3abd01af8c39 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu deleted file mode 100644 index 23170bf0087906..00000000000000 --- a/paddle/fluid/operators/arg_min_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" -REGISTER_OP_CUDA_KERNEL( - arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37..1a8aca777370bc 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16b..00000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. -template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. 
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ‘axis’ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061..00000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b..c927eec00bc8bf 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63..359b00fcf87ee1 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33..b452dea8536dd9 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c54f..0783b30a8580db 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7b5e..bc9076f4d7c368 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc3b7..9f6a78ab7a55f3 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4f..062e7d510d54c0 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92877..00000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? 
weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a19..00000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index c3917fad555cb4..1063a8b7992153 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -167,9 +167,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, BroadcastTensorsInferShapeFunctor, - PT_INFER_META(phi::BroadcastTensorsInferMeta)); + PD_INFER_META(phi::BroadcastTensorsInferMeta)); REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, diff --git 
a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6bafd4a..ed80ac076c0af7 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda99..7206dd01bcaa3e 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f13965..0946ad8aca65e2 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed42..61e5f279034779 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2..cf4d6a28744b36 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e..c4e410d04da5fb 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index d9a7a4abb08fc8..8b498787c69db0 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index b8abf458c1c6d3..133085ad3f3b0f 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index bb78971734bf05..36c6f4fadd0fcc 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -27,7 +27,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 8f7b8c4a9040be..6e02d362156970 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index c40b2c3e76a02c..57e3dd53cc7748 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -25,7 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 1da7798ea26965..059fafa3e7f4d4 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -205,8 +205,8 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, - PT_INFER_META(phi::ConcatInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a3db..cbec1182f20b88 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index 9f229e6f15c218..dd407f4f6f3c51 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -58,8 +58,8 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - PT_INFER_META(phi::CompareAllInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 5d9cdc617690f0..72d81d8c3fdf28 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -96,8 +96,8 @@ class CompareOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ - DELCARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ - 
PT_INFER_META(phi::CompareInferMeta)); \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603f0e..674b75625d1983 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 7c80917a71369e..11633fb0b87032 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,10 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598574..568c7982cfc7c0 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) 
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 511d8e0eed1065..0d9fbf612f73c4 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -240,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e127..00000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c44d..00000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. 
- Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if (clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? 
boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741..f89ecd37222870 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
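// For reference, a standalone sketch (plain C++, not part of the operator code)
// of the identity commented above, d|A| = (dL/d|A| * |A|) * inverse(A)^T, worked
// out for a single 2x2 matrix; the values and names below are illustrative only.
#include <array>
#include <cstdio>

int main() {
  std::array<std::array<double, 2>, 2> A = {{{3.0, 1.0}, {2.0, 4.0}}};
  const double det = A[0][0] * A[1][1] - A[0][1] * A[1][0];
  // inverse(A) of a 2x2 matrix: adjugate / determinant.
  const std::array<std::array<double, 2>, 2> inv = {
      {{A[1][1] / det, -A[0][1] / det}, {-A[1][0] / det, A[0][0] / det}}};
  const double grad_det = 1.0;  // incoming gradient dL/d|A|
  // dL/dA[i][j] = grad_det * |A| * inverse(A)[j][i]  (the transpose(-2, -1) step).
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      std::printf("dL/dA[%d][%d] = %g\n", i, j, grad_det * det * inv[j][i]);
  return 0;
}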
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! 
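// For reference, a standalone sketch of the conj().transpose(-2, -1) step in the
// identity above: for a complex matrix it is simply the Hermitian adjoint (for
// real dtypes the conjugation is a no-op). Values below are illustrative only.
#include <array>
#include <complex>
#include <cstdio>

int main() {
  using C = std::complex<double>;
  std::array<std::array<C, 2>, 2> M = {{{C(1, 2), C(3, -1)}, {C(0, 4), C(2, 2)}}};
  std::array<std::array<C, 2>, 2> Mh;
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      Mh[i][j] = std::conj(M[j][i]);  // conjugate, then swap the last two dims
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      std::printf("Mh[%d][%d] = %g%+gi\n", i, j, Mh[i][j].real(), Mh[i][j].imag());
  return 0;
}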
inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79af5..93fbff67e220bc 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -62,8 +62,8 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index 20813f8bb44e20..bf3cc941539eae 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -105,8 +105,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, - PT_INFER_META(phi::DiagonalInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f1365567f9..55b2484941293c 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0102..00000000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = 
context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. (rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
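// For reference, a standalone sketch of the Lp distance defined above,
// out = (sum_i |x_i - y_i|^p)^(1/p), and of its elementwise gradient for a
// finite nonzero p, matching the "dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout"
// rule used below. Inputs here are illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const double p = 3.0, dout = 1.0;
  std::vector<double> x = {1.0, -2.0, 0.5};
  std::vector<double> y = {0.0, 1.0, 0.5};
  double sum = 0.0;
  for (size_t i = 0; i < x.size(); ++i) sum += std::pow(std::fabs(x[i] - y[i]), p);
  const double out = std::pow(sum, 1.0 / p);
  std::printf("dist = %g\n", out);
  for (size_t i = 0; i < x.size(); ++i) {
    const double z = x[i] - y[i];
    const double sign = (z > 0.0) ? 1.0 : (z < 0.0 ? -1.0 : 0.0);
    std::printf("dz[%zu] = %g\n", i, std::pow(std::fabs(z) / out, p - 1.0) * sign * dout);
  }
  return 0;
}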
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index a86a3bb35927d5..8efdd15781a6f2 100644 --- a/paddle/fluid/operators/dot_op.cc +++ 
b/paddle/fluid/operators/dot_op.cc @@ -101,8 +101,8 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, - PT_INFER_META(phi::DotInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index dcdab033e8f801..144198367d538e 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -177,12 +176,13 @@ __global__ void DropoutGradCUDAKernel( } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -220,7 +220,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 
4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -266,7 +267,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, cudaMemcpyDeviceToDevice, stream)); #endif } else { - T factor = static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); std::vector ins = {&x}; std::vector outs = {y}; auto functor = phi::funcs::ScaleFunctor(factor); @@ -277,11 +279,13 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887..c62d45570ba291 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2..6d52ce45c4c100 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3..00000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e6194747..00000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? 
context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c..07b3b538116257 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c98..bdf08646f1d8b9 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. 
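// For reference, a standalone sketch of the "upscale_in_train" semantics of the
// CPU dropout kernel removed above: each element is kept with probability
// 1 - dropout_prob and scaled by 1/(1 - dropout_prob), so inference becomes a
// pass-through; the backward pass is then dx = dy * mask / (1 - dropout_prob).
// Seed and values below are illustrative only.
#include <cstdio>
#include <random>
#include <vector>

int main() {
  const float p = 0.5f;  // dropout_prob
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y(x.size()), mask(x.size());
  std::mt19937 engine(42);  // fixed seed, analogous to the fix_seed/seed attributes
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (uniform(engine) < p) {
      mask[i] = 0.f;
      y[i] = 0.f;
    } else {
      mask[i] = 1.f;
      y[i] = x[i] / (1.f - p);
    }
  }
  for (size_t i = 0; i < x.size(); ++i)
    std::printf("y[%zu] = %g, mask[%zu] = %g\n", i, y[i], i, mask[i]);
  return 0;
}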
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b0..7d8660f238abc8 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index e9c6c1eb7eced7..5e4c83e1a45ebd 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -214,12 +221,17 @@ class EigKernel : public framework::OpKernel { ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::dtype::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values auto* real_part_data = real_part.data>(); @@ -233,7 +245,8 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. 
construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); ConstructComplexVectors, Tout>( @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ void ComputeBackwardForComplexInput( // x_grad: out int m = Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); 
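// For reference, a standalone sketch of steps 1-2 of EigKernel above: the
// real_values buffer holds the real parts in its first `order` entries and the
// imaginary parts in the next `order` entries (hence the two Slice calls), and
// complex eigenvalues are rebuilt from the two halves. Values are illustrative.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const int order = 3;
  std::vector<double> real_values = {1.0, 2.0, 3.0,    // real parts
                                     0.5, -0.5, 0.0};  // imaginary parts
  std::vector<std::complex<double>> values(order);
  for (int i = 0; i < order; ++i)
    values[i] = {real_values[i], real_values[order + i]};
  for (int i = 0; i < order; ++i)
    std::printf("lambda[%d] = %g%+gi\n", i, values[i].real(), values[i].imag());
  return 0;
}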
- auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6dd..4e33c567eb6d12 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." - "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df..00000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 5279ec750935c9..00000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::dtype::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff 
--git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d22..13fd9b81a8765a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5337..00000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a57..e9adb9abdb528c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
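// For reference, a standalone numeric restatement of the double-grad formulas in
// the kernel removed above (with out = x / y and the first-order grads
// dx = dout / y, dy = -dout * out / y):
//   ddOut = (ddX - Out * ddY) / Y
//   dY    = Out * dX * ddY / Y - dX * ddX / Y
//   dOut  = -dX * ddY
// Scalar values below are illustrative only.
#include <cstdio>

int main() {
  const double x = 6.0, y = 2.0, dout = 1.0, ddx = 0.3, ddy = 0.7;
  const double out = x / y;
  const double dx = dout / y;  // first-order grad of x, the DX input above
  const double ddout = (ddx - out * ddy) / y;
  const double dy_new = out * dx * ddy / y - dx * ddx / y;
  const double dout_new = -dx * ddy;
  std::printf("ddOut = %g, dY = %g, dOut = %g\n", ddout, dy_new, dout_new);
  return 0;
}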
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d1..14baeaa74d2421 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { @@ -257,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c..830e09eeae4811 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af..f7b9fd1e265f5d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - 
ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f..58a3123c7e332f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - 
auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // 
out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 61862aa9f87408..80b07721f0b4d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -45,6 +45,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::funcs::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -1174,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - 
ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index fc128a88f2096a..3e9263fe93acd9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c01..7890d634e99417 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 6baa504562e76f..9e0e4e7fe1c6d2 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -88,11 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, - PT_INFER_META(phi::CreateInferMeta)); - -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - EmptyInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6772..374b00792622f9 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6..df00ae54c1036b 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index f8c6b4eb8c5e09..537c218d357b67 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -67,8 +67,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, - PT_INFER_META(phi::EyeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a..cb03add3143278 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 00000000000000..508730c3c7335d --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,655 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + /* + if (!x1_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x1_lods_data[p] = p; + } + if (idx == 0) { + x1_lods_data[N] = N; + } + } + + if (!x2_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x2_lods_data[p] = p; + } + if (idx == 0) { + x2_lods_data[N] = N; + } + } + + if (!x1_lods_filled || !x2_lods_filled) { + b.sync(); + } + */ + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = 
prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + // flag + // local_flag = 0; + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + // 32 threads + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + + // ins_start = 1 + // BUG fix + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + // batch_len = 2 + // batch_len = 4 + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + // t = 0 + // t = 1 + int t = out_lods_idx - 1; + // out_lods_data[0] = 0; + int previous; + + if (out_lods_idx == p_flag + 1) { + // out_lods_data[t] = p_out_lods; + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + + // optimized + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + // int x2_lods_filled = 1; + + Vector x2_lods; + // Vector, in GPU + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + // x2_lods_filled = 1; + + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(i + 1); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + // Vector, in GPU + // int x1_lods_filled = 1; + Vector x1_lods; + + if (!is_x1_lod) { + // move to cuda + // x1_lods.resize(x1->dims()[0] + 1); + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + // x1_lods_filled = 1; + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + // int out_first = x1->dims()[0]; + // if (x1_lods_filled) { + // out_first = x1_lods.back(); + // } + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + 
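The out_lods consumed here are produced by the warp-level inclusive scan inside filter_copy_fuse_kernel. As a reading aid, a stripped-down version of that scan pattern is sketched below; it assumes the cooperative_groups header and the cg alias declared earlier in this file, and a full 32-lane tile rather than the kernel's partial-warp handling.

#if defined(PADDLE_WITH_CUDA)
__device__ inline int WarpInclusiveScanSketch(cg::thread_block_tile<32> g,
                                              int val) {
  // After log2(32) doubling steps every lane holds the inclusive prefix
  // sum of the tile; the kernel uses the same idea to derive per-thread
  // output offsets before the copy phase.
  for (int i = 1; i < 32; i *= 2) {
    int up = g.shfl_up(val, i);
    if (static_cast<int>(g.thread_rank()) >= i) val += up;
  }
  return val;
}
#endif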
if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / 
block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e3..3abc980ceaafc3 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4..92f59e118c3b7b 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], 
platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae505..80e7f5c001d4b8 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index b3792a176fabeb..a80f590aa495db 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -405,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -421,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -437,8 +457,18 @@ 
TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 02027767579735..3c9e16785eac81 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -140,9 +140,9 @@ class FMHARef { if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +242,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d815..18c7187fc8e64c 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f..f3f8f174275778 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd1..c38d9f7d4bcbd2 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class 
FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 00000000000000..4c4e3661e6d6ed --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be multiple of %d when auxiliary_key given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector out_dims; + out_dims.reserve(static_cast(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X coulbe be more than two dimension. When + set trans_x=true, it would fully reverse X. For instant: X with shpae + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimension. When + set trans_y=true, it would transpose Y. For instant: Y with shpae + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be null + operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activeation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X could be more than two dimension and would be flatten to 2D for computing. 
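The flattening rule noted in the operator comment above can be made concrete with a small reference sketch (an editor's illustration, not Paddle code; the helper name naive_gemm_epilogue and the row-major, no-transpose layout are assumptions): X is viewed as a 2-D matrix [M, K], Y is [K, N], Bias has length N, and Out = Act(X_2D * Y + Bias) with the bias broadcast over rows and Act one of none, relu, gelu.

#include <cassert>
#include <cmath>
#include <cstddef>
#include <string>
#include <vector>

// Reference semantics (sketch): Out = Act(X_2D * Y + Bias), row-major, no transposes.
// X_2D is X flattened to [M, K]; Bias has length N and is broadcast over the M rows.
std::vector<float> naive_gemm_epilogue(const std::vector<float>& x_2d,   // M x K
                                       const std::vector<float>& y,      // K x N
                                       const std::vector<float>& bias,   // N
                                       std::size_t M, std::size_t K, std::size_t N,
                                       const std::string& act) {
  assert(x_2d.size() == M * K && y.size() == K * N && bias.size() == N);
  std::vector<float> out(M * N, 0.0f);
  for (std::size_t m = 0; m < M; ++m) {
    for (std::size_t n = 0; n < N; ++n) {
      float acc = bias[n];  // the epilogue bias term
      for (std::size_t k = 0; k < K; ++k) acc += x_2d[m * K + k] * y[k * N + n];
      if (act == "relu") {
        acc = acc > 0.0f ? acc : 0.0f;
      } else if (act == "gelu") {  // tanh approximation of GELU
        acc = 0.5f * acc *
              (1.0f + std::tanh(0.7978845608f * (acc + 0.044715f * acc * acc * acc)));
      }  // act == "none" leaves acc unchanged
      out[m * N + n] = acc;
    }
  }
  return out;
}

This mirrors the comment's statement that the operator is equivalent to paddle.nn.Linear followed by an optional activation.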
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. 
" + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. +X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 00000000000000..e16c9e8f483ccc --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
+ reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index fcd3384ac24444..e5ca15a39ef51f 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -130,11 +130,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, - PT_INFER_META(phi::GatherNdInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, - PT_INFER_META(phi::GatherNdGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 7f6c82032fe39d..c84e94f5c71277 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -61,8 +61,8 @@ selected ids. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, - PT_INFER_META(phi::GatherTreeMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 6b559885c569d0..66eecc13d04d1a 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,12 +15,14 @@ limitations under the License. 
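Pulling together the ReserveSpace rules spread across the two new fused_gemm_epilogue files above, as a hedged helper sketch whose function name is invented for illustration: InferShape requires the output width N to be a multiple of 128 for relu and 8 for gelu when the auxiliary output is requested, and the CUDA kernel then reserves numel / 8 bytes for relu (a bit-mask) versus numel * sizeof(T) bytes for gelu.

#include <cstddef>
#include <stdexcept>
#include <string>

// Sketch of how the new kernels size the auxiliary buffer and what InferShape
// demands of N (the output width) before the auxiliary path is allowed.
std::size_t reserve_space_bytes(std::size_t numel, std::size_t n,
                                std::size_t elem_size,
                                const std::string& activation) {
  if (activation == "relu") {
    if (n % 128 != 0) throw std::invalid_argument("relu aux requires N % 128 == 0");
    return numel / 8;          // one bit per output element (a bit-mask)
  }
  if (activation == "gelu") {
    if (n % 8 != 0) throw std::invalid_argument("gelu aux requires N % 8 == 0");
    return numel * elem_size;  // one element-sized auxiliary value per output
  }
  return 0;                    // "none": ReserveSpace is not used
}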
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -54,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -171,11 +141,20 @@ Used to initialize tensors with gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 717ec774414bf8..00ce10bfe3bccb 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -45,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf..f3ac53138328db 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
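The one-line gaussian_random_op.cu change above casts mean_ and std_ to the math type MT before constructing thrust::normal_distribution, which matters when T is float16 but MT is float. A minimal host-side sketch of the same thrust API (illustrative values only, not part of the patch):

#include <thrust/random.h>
#include <cstdio>

int main() {
  thrust::minstd_rand rng(2022);  // seeded engine, as in GaussianGenerator
  rng.discard(7);                 // mirrors rng.discard(n + offset_) per thread
  // The distribution's parameter type must match MT; hence the explicit
  // static_cast<MT>(mean_) / static_cast<MT>(std_) when T is float16.
  float mean = 0.0f, stddev = 1.0f;
  thrust::normal_distribution<float> dist(mean, stddev);
  std::printf("sample = %f\n", dist(rng));
  return 0;
}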
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e..f7c006dbcb1a9a 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); + ops::GraphSendRecvGradOpMaker, + 
GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac384..00000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void 
ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + 
block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = 
framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845b..00000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if 
(pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if 
(index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789ad6..524f2d6c9d7194 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c394..3c9bbc753f29b1 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace 
paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 567a69f383d1cc..16968876ac96ca 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197434..e2efaa1759b008 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad37..8324a6215bca81 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 68d002fceea70f..d17c6368c7537b 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -100,8 +100,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, - PT_INFER_META(phi::IndexSampleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, ops::IndexSampleGradMaker, diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02..31c22915ec5d05 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc77392..c835bb3cf60bfb 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b12..cfa370ff9cb19d 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
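The is_empty change above, and the isfinite_v2/linspace/log_loss changes that follow, all apply the same recipe: the hand-written InferShape override is dropped, a functor is generated from a phi InferMeta via DECLARE_INFER_SHAPE_FUNCTOR and PD_INFER_META, and that functor is passed as the trailing argument of REGISTER_OPERATOR. A minimal sketch of that shape, using placeholder names (my_op / MyOp / MyOpMaker / phi::MyInferMeta are illustrative, not operators in this diff):

#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"

namespace ops = paddle::operators;

// Generates MyOpInferShapeFunctor, which adapts the framework's
// InferShapeContext to phi MetaTensors and forwards to phi::MyInferMeta.
DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyInferMeta));

// The functor replaces the removed InferShape override and rides along as the
// last registration argument; the grad-maker arguments follow the usual form.
REGISTER_OPERATOR(
    my_op, ops::MyOp, ops::MyOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MyOpInferShapeFunctor);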
-#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f..00000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008..f6f56f70f1a119 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index fef6fc5319ebdb..5e053445379118 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -85,8 +85,8 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, - PT_INFER_META(phi::LerpInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(lerp, LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a7..378c7573d6129a 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0..00000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b..00000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
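The CUDA linspace kernel deleted above and the CPU kernel deleted just below share the same numeric trick: with step = (stop - start) / (num - 1), the front half of the output counts up from start and the back half counts down from stop, so both endpoints are reproduced exactly despite floating-point rounding. A standalone sketch of that fill, independent of the framework types:

#include <cstdint>
#include <vector>

// Sketch of the symmetric fill used by the removed linspace kernels:
// indices below num/2 march forward from `start`, the rest march backward
// from `stop`, so out.front() == start and out.back() == stop exactly.
template <typename T>
std::vector<T> Linspace(T start, T stop, int64_t num) {
  std::vector<T> out(num);
  if (num == 1) {
    out[0] = start;
    return out;
  }
  const double step =
      (static_cast<double>(stop) - static_cast<double>(start)) / (num - 1);
  for (int64_t i = 0; i < num; ++i) {
    out[i] = (i < num / 2) ? static_cast<T>(start + step * i)
                           : static_cast<T>(stop - step * (num - i - 1));
  }
  return out;
}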
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 2e596ff3e62573..883e3597d8a311 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,7 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 14b12ca3acb19a..31a98d9f630e1c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -5,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif (WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() @@ -44,8 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c8927..c9308d27c0a349 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // mlu should do sth + // init ins tensors + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // init out tensors + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // MLU should do sth + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims); + for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // init out tensors + std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // init in tensors + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // MLU should do sth + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df8..00000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ 
/dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94..00000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7af..28ec3a871022f4 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73..1d0478db5ef4a8 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fec..1f4964f7715426 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb2204109d..01fa01e3c6ed04 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f3646..cdf204628b638f 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace operators { @@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerGradOpMaker); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index d2c67d80b4f5a5..00000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
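The maxouting.cc and maxouting.cu hunks above replace the per-device class specializations of MaxOutFunctor / MaxOutGradFunctor with a single templated member definition plus explicit instantiations, so one body now serves both the legacy platform device contexts and the phi contexts instantiated at the bottom of each file. A minimal, self-contained illustration of that C++ pattern, with made-up names (ScaleFunctor, CpuCtx, GpuCtx stand in for the real functor and context types):

// One templated definition, many explicit instantiations.
struct CpuCtx {};
struct GpuCtx {};

template <typename DeviceContext, typename T>
class ScaleFunctor {
 public:
  void operator()(const DeviceContext& ctx, const T* in, T* out, int n, T k);
};

// Single shared body; no per-context specialization needed.
template <typename DeviceContext, typename T>
void ScaleFunctor<DeviceContext, T>::operator()(const DeviceContext& /*ctx*/,
                                                const T* in, T* out, int n,
                                                T k) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * k;
}

// Explicit instantiations, analogous to instantiating the maxout functors for
// both the platform device contexts and the phi contexts.
template class ScaleFunctor<CpuCtx, float>;
template class ScaleFunctor<CpuCtx, double>;
template class ScaleFunctor<GpuCtx, float>;
template class ScaleFunctor<GpuCtx, double>;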
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. 
- Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def..e55369e0691ee5 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed..00000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
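The MatrixPowerFunction removed above handles |n| <= 4 with explicit MatMul chains and switches to exponentiation by squaring for larger exponents (a negative n is first reduced to a positive power of X^{-1} via the inverse functor), which is where the advertised O(logN) cost comes from. A scalar sketch of the same bit loop; the deleted code runs it on square matrices with blas.MatMul:

// Exponentiation by squaring, mirroring the `bit = new_n & 0x1` loop in the
// removed MatrixPowerFunction. Shown on scalars for brevity; n must be > 0
// here (the operator maps n < 0 to a positive power of the inverse).
double PowBySquaring(double x, int n) {
  double out = 1.0;  // the tensor version tracks an "out_inited" flag instead
                     // of materializing an identity matrix
  double z = x;      // z holds x^(2^k) at the start of iteration k
  while (n > 0) {
    if (n & 0x1) out *= z;  // multiply in the current power when the bit is set
    z *= z;
    n >>= 1;
  }
  return out;
}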
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46..056620db5b9669 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -123,13 +123,10 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e..00000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. 
-// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b..00000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94..1ce02ff4525c96 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
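Both deleted accuracy kernels (CUDA above, the CPU header below it) implement the same metric: a sample counts as correct if its label appears anywhere in its row of top-k indices, and accuracy = correct / total. A standalone sketch of that reduction; the function name and signature are illustrative, not the Paddle kernel API:

#include <cstdint>
#include <vector>

// indices: num_samples x class_dim row-major top-k indices.
// labels:  num_samples ground-truth labels.
// Returns accuracy in [0, 1] and reports the correct/total counts.
float TopKAccuracy(const std::vector<int64_t>& indices,
                   const std::vector<int64_t>& labels, int64_t num_samples,
                   int64_t class_dim, int* correct, int* total) {
  int num_correct = 0;
  for (int64_t i = 0; i < num_samples; ++i) {
    for (int64_t j = 0; j < class_dim; ++j) {
      if (indices[i * class_dim + j] == labels[i]) {
        ++num_correct;
        break;
      }
    }
  }
  *correct = num_correct;
  *total = static_cast<int>(num_samples);
  return num_samples == 0 ? 0.f
                          : static_cast<float>(num_correct) /
                                static_cast<float>(num_samples);
}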
*/ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index e83278f88b82a3..9f2ca4165f33a2 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99..3cc1be4de8a82f 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 54ecba08a82dce..f3ed98c3f4d1e4 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,4 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b..a3b764b0e1c46a 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 05cd264cf3ec9e..23428dd403e9b1 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c..e9dadd5ec937cd 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a84ff..916f02179b364b 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -27,7 +27,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 884521301750ce..6e3bd5e43c9c1d 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbbf53..1fdaa153e3c27e 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2a54a8392c7c5b..b55b10686e92e2 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& 
ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62..2bacda8afb0eb3 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad91e7..b309e1b87ef903 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // 
multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, 
mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto 
ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37aa54..0113f638b9a47d 100644 --- 
a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc60d..bf7222fc45c660 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 +16,11 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876..6c35ad29e9749d 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
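GetOrder, removed from multi_dot_op.cc above, is the classic matrix-chain-multiplication dynamic program: with ins[i] of shape (p[i], p[i+1]), m[i][j] is the cheapest scalar-multiplication count for the chain ins[i..j] and order[i][j] records the split k at which ins[i..k] and ins[k+1..j] are multiplied last. A self-contained sketch over the dimension vector p (plain C++, no Paddle types):

#include <cstdint>
#include <vector>

// p has n + 1 entries; matrix i has shape (p[i], p[i+1]).
// Returns the n x n split table; order[i * n + j] is the optimal split k.
std::vector<uint64_t> MatChainOrder(const std::vector<uint64_t>& p) {
  const uint64_t n = p.size() - 1;
  std::vector<uint64_t> m(n * n, 0);      // minimal cost for chain [i, j]
  std::vector<uint64_t> order(n * n, 0);  // argmin split for chain [i, j]
  for (uint64_t l = 1; l < n; ++l) {      // l = chain length - 1
    for (uint64_t i = 0; i + l < n; ++i) {
      const uint64_t j = i + l;
      m[i * n + j] = UINT64_MAX;
      for (uint64_t k = i; k < j; ++k) {
        const uint64_t q =
            m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1];
        if (q < m[i * n + j]) {
          m[i * n + j] = q;
          order[i * n + j] = k;
        }
      }
    }
  }
  return order;
}

For example, p = {10, 20, 5, 30} (three matrices) gives order[0 * 3 + 2] = 1, i.e. (ins[0] * ins[1]) * ins[2] at cost 2500 versus 9000 for the other parenthesization, which is exactly the cost1/cost2 comparison the removed three-matrix fast path performs.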
*/ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -264,10 +264,3 @@ REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, ops::NLLLossGradMaker); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a..00000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. " - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ad7f93d73e902b..315831ddc0f290 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
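The removed nll_loss_1D / nll_loss_grad_1D pair above picks, per sample, the (optionally class-weighted) negative log-probability at its label, skips ignore_index, and for reduction == "mean" divides by the accumulated weight; the backward pass scatters -dout * weight back onto the picked positions. A condensed standalone sketch of the "sum"/"mean" forward path, with illustrative names and none of the Paddle error checks:

#include <cstdint>
#include <string>
#include <vector>

// x: batch_size x n_classes log-probabilities (row-major).
// Returns the reduced loss and writes the accumulated class weight.
double NllLoss1D(const std::vector<double>& x,
                 const std::vector<int64_t>& labels,
                 const std::vector<double>& class_weight,  // may be empty
                 int64_t n_classes, const std::string& reduction,
                 int64_t ignore_index, double* total_weight) {
  double loss = 0.0, weight_sum = 0.0;
  const int64_t batch_size = static_cast<int64_t>(labels.size());
  for (int64_t i = 0; i < batch_size; ++i) {
    const int64_t label = labels[i];
    if (label == ignore_index) continue;
    const double w = class_weight.empty() ? 1.0 : class_weight[label];
    weight_sum += w;
    loss -= x[i * n_classes + label] * w;
  }
  if (reduction == "mean" && weight_sum != 0.0) loss /= weight_sum;
  *total_weight = weight_sum;
  return loss;
}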
*/ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. 
You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ param\_out = param + param\_update } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858bbe..00000000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c8c9..036839dd1300fe 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
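The element-wise update implemented by the deleted AdadeltaOpKernel above (and now provided by the phi kernel this PR registers) is: E[g^2] <- rho * E[g^2] + (1 - rho) * g^2; update = -sqrt((E[dx^2] + eps) / (E[g^2] + eps)) * g; E[dx^2] <- rho * E[dx^2] + (1 - rho) * update^2; param <- param + update. A per-element scalar sketch, standalone and illustrative only:

#include <cmath>

// One Adadelta step for a single parameter element, mirroring the
// Eigen expressions in the removed AdadeltaOpKernel.
void AdadeltaStep(double grad, double rho, double epsilon, double* param,
                  double* avg_squared_grad, double* avg_squared_update) {
  *avg_squared_grad = rho * *avg_squared_grad + (1 - rho) * grad * grad;
  const double update = -std::sqrt((*avg_squared_update + epsilon) /
                                   (*avg_squared_grad + epsilon)) *
                        grad;
  *avg_squared_update =
      rho * *avg_squared_update + (1 - rho) * update * update;
  *param += update;
}

The epsilon in both numerator and denominator keeps the ratio finite before either accumulator has built up, which is why the op adds it on both sides.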
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1cbc..00000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - 
auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1db0c..21ca26f49f653d 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb027..d5896c4105932e 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47509..d061f9ae056134 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -81,8 +81,12 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); } else { for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { + output_shape.push_back(in_x_dims[i + 2]); + } else { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } } ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8..54e31845ad4bd5 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
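// Illustrative, framework-free sketch of the per-element update the deleted
// AdamaxOpKernel computed with Eigen; beta1, beta2 and epsilon are the op
// attributes and beta1_pow is the Beta1Pow accumulator. The function name and
// std::vector storage are assumptions for this sketch, not Paddle API.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void adamax_step(std::vector<float>& param, const std::vector<float>& grad,
                 std::vector<float>& moment, std::vector<float>& inf_norm,
                 float lr, float beta1, float beta2, float epsilon,
                 float beta1_pow) {
  const float lr_t = lr / (1.0f - beta1_pow);  // bias-corrected learning rate
  for (size_t i = 0; i < param.size(); ++i) {
    moment[i] = beta1 * moment[i] + (1.0f - beta1) * grad[i];
    // infinity-norm accumulator; adding epsilon keeps the divisor away from 0
    inf_norm[i] = std::max(std::fabs(grad[i]), beta2 * inf_norm[i] + epsilon);
    param[i] -= lr_t * moment[i] / inf_norm[i];
  }
}

int main() {
  std::vector<float> p{1.0f}, g{0.3f}, m{0.0f}, u{0.0f};
  adamax_step(p, g, m, u, /*lr=*/0.01f, /*beta1=*/0.9f, /*beta2=*/0.999f,
              /*epsilon=*/1.0e-8f, /*beta1_pow=*/0.9f);
  std::printf("%f\n", p[0]);
  return 0;
}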
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c6..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - 
gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
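// Illustrative, framework-free sketch of the put_along_axis semantics the
// deleted CPU/CUDA kernels dispatch on (reduce_op is "add", "mul"/"multiply",
// or "assign"; the real kernels also branch on int32/int64 index dtype, and the
// backward pass uses gather because "the gradient of scatter is gather").
// Shown for a 1-D tensor along axis 0; names and storage are assumptions for
// this sketch, not Paddle API.
#include <cstdio>
#include <string>
#include <vector>

void put_along_axis_1d(std::vector<float>& out, const std::vector<int>& index,
                       const std::vector<float>& value,
                       const std::string& reduce_op) {
  for (size_t i = 0; i < index.size(); ++i) {
    if (reduce_op == "add") {
      out[index[i]] += value[i];
    } else if (reduce_op == "mul" || reduce_op == "multiply") {
      out[index[i]] *= value[i];
    } else {  // "assign" is the default reduce op
      out[index[i]] = value[i];
    }
  }
}

// Value gradient as a gather of the result gradient (simplified; the deleted
// grad kernels also adjust the input gradient via *_scatter_input_grad_kernel).
std::vector<float> value_grad_1d(const std::vector<float>& result_grad,
                                 const std::vector<int>& index) {
  std::vector<float> vg(index.size());
  for (size_t i = 0; i < index.size(); ++i) vg[i] = result_grad[index[i]];
  return vg;
}

int main() {
  std::vector<float> x{1.0f, 1.0f, 1.0f, 1.0f};
  put_along_axis_1d(x, {0, 2}, {5.0f, 7.0f}, "add");
  std::printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);  // 6 1 8 1
  auto vg = value_grad_1d({1.0f, 2.0f, 3.0f, 4.0f}, {0, 2});
  std::printf("%g %g\n", vg[0], vg[1]);  // 1 3
  return 0;
}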
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18..c7e91ba35dee13 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35..4b6759ea165edf 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df881..f0f3b6b7f9fdfe 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 28a8484f539fc9..18e444702fbb2c 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a805726..41df8e4a15f093 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ 
b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 6157a3a925de51..4a18330913803f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -96,8 +96,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PT_INFER_META(phi::MeanRawInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee1048890..160617695338a9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 8ef0712dc7a757..6441d53239e955 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -102,8 +102,8 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PT_INFER_META(phi::ReduceInferMetaBase)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457ed..a473b54c1f8559 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. 
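// Illustrative, framework-free sketch of what the reduce_max operator and its
// still-registered reduce_max_grad kernel compute, reduced here over a whole
// 1-D tensor (the real kernels generalize over reduce dims and keep_dim and use
// MaxOrMinGradFunctor). Names and storage are assumptions for this sketch, not
// Paddle API.
#include <cstdio>
#include <vector>

float reduce_max(const std::vector<float>& x) {
  float m = x[0];
  for (float v : x) m = v > m ? v : m;
  return m;
}

// d(max)/dx_i is 1 where x_i equals the max and 0 elsewhere, so the upstream
// gradient is routed only to the max position(s).
std::vector<float> reduce_max_grad(const std::vector<float>& x, float out,
                                   float out_grad) {
  std::vector<float> dx(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out) dx[i] = out_grad;
  }
  return dx;
}

int main() {
  std::vector<float> x{1.0f, 4.0f, 3.0f};
  const float y = reduce_max(x);
  const auto dx = reduce_max_grad(x, y, 1.0f);
  std::printf("max=%g dx=[%g %g %g]\n", y, dx[0], dx[1], dx[2]);  // max=4 dx=[0 1 0]
  return 0;
}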
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b54132..cbf2b9152079e1 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index b7be4cfb2a3950..0ae0e1500c1662 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -119,12 +119,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, ScatterNdAddGradInferShapeFunctor, - PT_INFER_META(phi::ScatterNdAddGradInferMeta)); + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index fec003305fdc65..5f6b04cf59e0e3 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -103,11 +103,11 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, - PT_INFER_META(phi::ScatterInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + PD_INFER_META(phi::ScatterInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, - PT_INFER_META(phi::ScatterGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad..9d4c8532a82c06 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index e147e62a983540..00000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f9888..00000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. 
Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if (pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - 
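// Illustrative, framework-free sketch of the segment pooling computed by the
// deleted SegmentPool kernels: segment_ids are non-decreasing and the number of
// output rows is last_segment_id + 1, as in the deleted launch helper. Only SUM
// and MEAN are shown (the real op also supports MAX/MIN and records per-segment
// counts in SummedIds for MEAN). Names and storage are assumptions for this
// sketch, not Paddle API.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

std::vector<float> segment_pool_1d(const std::vector<float>& x,
                                   const std::vector<int64_t>& segment_ids,
                                   const std::string& pooltype) {
  const int64_t rows = segment_ids.back() + 1;
  std::vector<float> out(rows, 0.0f);
  std::vector<float> counts(rows, 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[segment_ids[i]] += x[i];       // SUM accumulation per segment
    counts[segment_ids[i]] += 1.0f;    // element count per segment
  }
  if (pooltype == "MEAN") {
    for (int64_t r = 0; r < rows; ++r) {
      if (counts[r] > 0.0f) out[r] /= counts[r];
    }
  }
  return out;
}

int main() {
  const auto out =
      segment_pool_1d({1.0f, 2.0f, 3.0f, 4.0f}, {0, 0, 1, 1}, "MEAN");
  std::printf("%g %g\n", out[0], out[1]);  // 1.5 3.5
  return 0;
}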
phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0372a79b967a48..59c6e16535738b 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -120,8 +120,8 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b26..23c6a0133e1eda 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b..7d0d782b837c4c 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - 
ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaac76..9f291a863c067a 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd727959202c6..4d459f8c01b159 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,201 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. - switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); 
- } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. - paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcf..46d64333b608b7 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a..e2c8359beb1290 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db..00000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
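// Illustrative, framework-free 1-D sketch of the three-step trick the deleted
// SetValueKernel used so the assignment needs only strided-slice writes and an
// elementwise subtraction: (1) zero the target region of out, (2) build a pad
// tensor that is (0 - value) inside the region and 0 outside, (3) compute
// out = out - pad, which leaves value in the region and out unchanged
// elsewhere. Names and storage are assumptions for this sketch, not Paddle API.
#include <cstdio>
#include <vector>

void set_value_1d(std::vector<float>& out, int start, int end, int step,
                  const std::vector<float>& value) {
  std::vector<float> pad(out.size(), 0.0f);
  size_t v = 0;
  for (int i = start; i < end; i += step, ++v) {
    out[i] = 0.0f;             // step 1: zero the slice of out
    pad[i] = 0.0f - value[v];  // step 2: pad holds -value inside the slice
  }
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] -= pad[i];          // step 3: out = out - pad
  }
}

int main() {
  std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
  set_value_1d(x, /*start=*/1, /*end=*/4, /*step=*/2, {9.0f, 9.0f});
  std::printf("%g %g %g %g %g\n", x[0], x[1], x[2], x[3], x[4]);  // 1 9 3 9 5
  return 0;
}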
*/ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710e..00000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d668347..f751ab41014c21 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a6432538..a62d1b434e7643 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 8e502fc04dbdb0..016ff54645b02e 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,46 +29,6 @@ const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -201,12 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e45a..ceb42dcf3e5921 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce1e8..84b0f403be0389 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18..3148b31a8322e2 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb..d61f5aa3f634cd 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f3462c9..af29aac6b90528 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e46..fa8a5e92712ec8 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379d..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad2..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6c..1de1b590a1311b 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb..80c9935057cb5d 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57b..d1add111e1d24c 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c..00000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
a808207476f3b9..00000000000000 --- a/paddle/fluid/operators/top_k_v2_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - The reason why we need the topk v2 is because the compatibility. We redefine - the NaN is maximum value - in the process of comparing. If do not add the topk v2, will affect the - inference result of model that traing - by the older version paddlepaddle. -*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -static void FullTopK(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - const int& k, const bool& largest, const bool& sorted) { - // when the k is small, will the partial sort - bool partial_sort_flag = (k * 64) < input_width; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - // Eigen::DSizes flat2dims(input_height, input_width); - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [&largest](const std::pair& l, const std::pair& r) { - if (largest) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - } else { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - } - }); - } else { - // use the nth-element to get the K-larger or K-small element - if (largest) { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort(col_vec.begin(), col_vec.begin() + k - 1, - [&largest](const std::pair& l, - const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - } - } else { - std::nth_element( - col_vec.begin(), 
col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort( - col_vec.begin(), col_vec.begin() + k - 1, - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - } - } - for (Type j = 0; j < k; ++j) { - t_out[i * k + j] = col_vec[j].first; - t_indices[i * k + j] = col_vec[j].second; - } - } -} - -template -static void FullTopKAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data, - const int& k) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class TopkV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Get the top k elements of each row of input tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - const auto& sorted = static_cast(context.Attr("sorted")); - const auto& largest = static_cast(context.Attr("largest")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - // if K tensor is not null, will the use K tesnor as k - auto* k_t = context.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - // accroding to axis to set K value in the dim - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - const auto& out_dims = output->dims(); - if (axis + 1 == in_dims.size()) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - FullTopK(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k, largest, sorted); - } else { - // if the topk dims is not last dim, will tranpose and do topk - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - // get the trans input_dims, out_dims - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - for (size_t i = 0; i < trans.size(); i++) { - trans_out_dims[i] = out_dims[trans[i]]; 
- } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - // Allocate the temp tensor to the save the topk indices, values - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - Tensor tmp_indices; - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - // get the TopK value - FullTopK(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k, largest, sorted); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - } - } -}; - -template -class TopkV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - const size_t& k = out_dims[axis]; - - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis + 1 == in_dims.size()) { - // allocate the memory for the input_grad - - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data, k); - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - // transpose the out_grad, indices - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // Do transpose - TransCompute(ndims, dev_context, *out_grad, - &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - - // Assign the out_grad to tranpose input_grad - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - 
FullTopKAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out, k); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 5b8a6b3e754495..caaae02124c926 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index e11070638834c4..dff5c2d3f39378 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc index 49daac2ff0da63..4d9c39be92eff0 100644 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/transpose_op.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a86ae..0590b66f6f8688 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -107,8 +107,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 5617d728a51dc1..fb39034c8e92c1 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
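Editor's sketch: the deleted TopkV2GradKernel above fills x_grad with zeros and then writes each out_grad entry back to the column recorded in Indices, following the "the gradient of gather is scatter" comment in the removed CUDA file. The self-contained snippet below reproduces that last-axis case with plain vectors; it is not Paddle code and the names are illustrative.

#include <cstddef>
#include <cstdint>
#include <vector>

// rows = product of all dims before the top-k axis, width = size of that axis.
std::vector<float> TopKGradLastAxis(const std::vector<float>& out_grad,
                                    const std::vector<int64_t>& indices,
                                    int rows, int width, int k) {
  // Counterpart of the memset(x_grad_data, 0, ...): elements outside the
  // selected top-k positions receive no gradient.
  std::vector<float> x_grad(static_cast<std::size_t>(rows) * width, 0.0f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < k; ++j) {
      // Indices produced by top-k are unique within a row, so plain assignment
      // matches the FullTopKAssign behaviour shown above.
      x_grad[i * width + indices[i * k + j]] = out_grad[i * k + j];
    }
  }
  return x_grad;
}

// Example: rows=1, width=5, k=2, indices={4, 0}, out_grad={0.5, 0.25}
// yields x_grad = {0.25, 0, 0, 0, 0.5}.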
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b0931b9..df84659a00f4c4 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/solve_op.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +25,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +119,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e84189..00000000000000 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h index 4e68add096ff28..315847b4d800e4 100644 --- a/paddle/fluid/operators/triangular_solve_op.h +++ b/paddle/fluid/operators/triangular_solve_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" namespace paddle { @@ -30,10 +29,10 @@ namespace operators { using Tensor = framework::Tensor; template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, +static void triangular_solve(const DeviceContext &context, const Tensor &x, + const Tensor &y, Tensor *out, bool upper, bool transpose, bool unitriangular) { - // Tensor broadcast use eigen + // Tensor broadcast use eigen library std::vector x_bst_dims_vec; std::vector y_bst_dims_vec; std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); @@ -64,15 +63,15 @@ static void triangular_solve(const DeviceContext& context, const Tensor& x, template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); + void operator()(const Tensor &input, Tensor *output, + const framework::ExecutionContext &ctx); }; template class MatrixReduceSumFunctor { public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { + void operator()(const Tensor &in, Tensor *out, + const framework::ExecutionContext &ctx) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] // out_reduce_dim should be [0, 2] const std::vector in_dims = phi::vectorize(in.dims()); @@ -101,129 +100,5 @@ class MatrixReduceSumFunctor { } }; -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. 
- triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc new file mode 100644 index 00000000000000..e36cbcf228cfbf --- /dev/null +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under +the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TrilTriuXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* x_data = x->data(); + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + auto xshape = phi::vectorize(x->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + tril_triu, ops::TrilTriuXPUKernel, + ops::TrilTriuXPUKernel); +#endif diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a74e..b77775f5a8c094 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbec..dc5a66dce16d69 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of TruncatedGaussianRandomOp should not be null.")); - auto shape = ctx->Attrs().Get>("shape"); - std::vector out_dim; - out_dim.reserve(shape.size()); - for (auto dim : shape) { - out_dim.push_back(static_cast(dim)); - } - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "the input shape of TruncatedGaussianRandomOp must be set, " - "But the rank of shape we received is %d", - shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator. 
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc
index c45b839d5b40bd..02fed3de6cef74 100644
--- a/paddle/fluid/operators/unfold_op.cc
+++ b/paddle/fluid/operators/unfold_op.cc
@@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
-                            PT_INFER_META(phi::UnfoldInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor,
+                            PD_INFER_META(phi::UnfoldInferMeta));
 REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker,
                   ops::UnfoldGradMaker<paddle::framework::OpDesc>,
                   ops::UnfoldGradMaker<paddle::imperative::OpBase>,
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b..1be8f3387dbad8 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
     scatter_nd_add_op.cc
     scatter_op.cc
     seed_op.cc
-    segment_pool_op.cc
     select_input_op.cc
     select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
     scale_op.cu
     scatter_nd_add_op.cu
     scatter_op.cu
-    seed_op.cu
-    segment_pool_op.cu)
+    seed_op.cu)
 register_unity_group(cu
     roi_pool_op.cu
     selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f34..a8ced783744a96 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a842..602376d54e0d2a 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel,
-    ops::ViterbiDecodeKernel);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c42..00000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)   \
-  case (1 << (log2_block_dim)): {                        \
-    constexpr auto kBlockDim = (1 << (log2_block_dim));  \
-    __VA_ARGS__;                                         \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)                 \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__);   \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);    \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template