diff --git a/.gitignore b/.gitignore
index 801790d0a4720..664c45b7202f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,12 +52,12 @@ tools/__pycache__
 
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
-paddle/infrt/dialect/pd_ops.td
+paddle/infrt/dialect/pd/ir/pd_ops.td
 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
 tools/infrt/kernels.json
 tools/infrt/kernel_signature.json
-paddle/infrt/dialect/pd_ops_info.h
+paddle/infrt/dialect/pd/common/pd_ops_info.h
 .lit_test_times.txt
 paddle/infrt/tests/dialect/Output
 paddle/infrt/tests/lit.cfg.py
diff --git a/README.md b/README.md
index cdbf2d9f3bf99..c4c5decec5430 100644
--- a/README.md
+++ b/README.md
@@ -14,9 +14,8 @@ English | [简体中文](./README_cn.md)
 
 Welcome to the PaddlePaddle GitHub.
 
-PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
-PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
-
+PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers, 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
 
 
 ## Installation
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 312a030524468..e09429bc42957 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -11,7 +11,7 @@ elseif(NEW_RELEASE_ALL)
   add_definitions(-DNEW_RELEASE_ALL)
   set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
   set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
-  set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "35 50 60 61 70 75 80")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Packge")
   add_definitions(-DNEW_RELEASE_PYPI)
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
index 661c3675c84b2..ba6f0396008fc 100644
--- a/cmake/external/paddle2onnx.cmake
+++ b/cmake/external/paddle2onnx.cmake
@@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS
       -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
       -DWITH_STATIC=OFF
       -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+      -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}
       -DCMAKE_POSITION_INDEPENDENT_CODE=ON
       -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
       ${EXTERNAL_OPTIONAL_ARGS}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 851bd81403a85..cafd1406b256f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -258,6 +258,12 @@ copy(inference_lib_dist
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
         DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+        copy(inference_lib_dist
+        SRCS  ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
+        DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
 copy(inference_lib_dist
         SRCS  ${PADDLE_SOURCE_DIR}/paddle/extension.h
         DSTS  ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index ebb686d8ad0f3..1c4dd723b9b71 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -118,7 +118,7 @@ function(kernel_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
             list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
         endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
             list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
         endif()
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
@@ -151,6 +151,9 @@ function(kernel_library TARGET)
                 file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
                 list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
             endif()
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc )
+                list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+            endif()
         endif()
     else()
         # TODO(chenweihang): impl compile by source later
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 3fca45cc068f9..49ba9479d49e9 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper)
 
 if (WITH_DISTRIBUTE)
   cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 5dc43af117825..cb82677a281e9 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank,
                         "Only CPU place is supported for ProcessGroupGloo."));
 }
 
-ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr<GlooStore>& store,
-                                   int rank, int world_size,
-                                   const std::shared_ptr<GlooOptions> options)
-    : ProcessGroup(rank, world_size), _tag(0), _store(store) {
+ProcessGroupGloo::ProcessGroupGloo(
+    const std::shared_ptr<paddle::distributed::Store>& store, int rank,
+    int world_size, const std::shared_ptr<GlooOptions> options)
+    : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) {
   _context = std::make_shared<gloo::rendezvous::Context>(rank, world_size);
   auto prefix_store =
       ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
index 24f156571a427..71e0a40f8a761 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
@@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup {
 
   class GlooStore : public ::gloo::rendezvous::Store {
    public:
-    explicit GlooStore(
-        const std::shared_ptr<paddle::distributed::TCPStore>& store)
+    explicit GlooStore(const std::shared_ptr<paddle::distributed::Store>& store)
         : _store(store) {}
 
     ~GlooStore() = default;
@@ -87,7 +86,7 @@ class ProcessGroupGloo : public ProcessGroup {
     }
 
    protected:
-    std::shared_ptr<paddle::distributed::TCPStore> _store;
+    std::shared_ptr<paddle::distributed::Store> _store;
   };
 
   class GlooOptions {
@@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup {
     std::shared_ptr<::gloo::transport::Device> device;
   };
 
-  explicit ProcessGroupGloo(const std::shared_ptr<GlooStore>& store, int rank,
-                            int world_size,
-                            std::shared_ptr<GlooOptions> options);
+  explicit ProcessGroupGloo(
+      const std::shared_ptr<paddle::distributed::Store>& store, int rank,
+      int world_size, std::shared_ptr<GlooOptions> options);
 
   ~ProcessGroupGloo() = default;
 
@@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup {
  protected:
   uint32_t _tag;
   std::shared_ptr<gloo::rendezvous::Context> _context;
-  std::shared_ptr<GlooStore> _store;
+  std::shared_ptr<::gloo::rendezvous::Store> _store;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 7f21bcee87ab7..2af407f711ec1 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
+    auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
     barrierTensors.push_back(dt);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
@@ -417,7 +417,7 @@ void CheckTensorsInDifferentDevices(const std::vector<Tensor>& tensors,
   std::set<Place> used_devices;
 
   for (const auto& t : tensors) {
-    PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true,
+    PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true,
                       platform::errors::InvalidArgument(
                           "Tensors must be CUDA and dense tensor."));
 
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index 5533f3f4cbf4b..79961cca85ae0 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -17,6 +17,20 @@
 namespace paddle {
 namespace distributed {
 
+static Backend TransToBackend(platform::Place place) {
+  static const std::map<phi::AllocationType, Backend> type_backend = {
+      {phi::AllocationType::GPU, Backend::GPU},
+      {phi::AllocationType::CPU, Backend::CPU},
+  };
+
+  phi::AllocationType type = place.GetType();
+  auto it = type_backend.find(type);
+  PADDLE_ENFORCE_EQ(it != type_backend.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Place type (%s) is not supported. ", place));
+  return it->second;
+}
+
 std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
     const std::vector<Tensor> tensors,
     const std::vector<bool> &is_sparse_gradient,
@@ -297,10 +311,18 @@ EagerReducer::EagerReducer(
         std::dynamic_pointer_cast<egr::GradNodeAccumulation>(grad_node);
     accumulation_grad_node->RegisterReduceHook(
         std::make_shared<egr::CppTensorVoidHook>(reduce_hook));
+
+    gradnode_index_map_[grad_node.get()] = global_var_index;
   }
 
   vars_marked_ready_.resize(tensors_.size(), false);
   local_used_vars_.resize(tensors_.size(), 0);
+
+  if (find_unused_vars_each_step_) {
+    global_used_vars_ = paddle::experimental::empty(
+        ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
+        inner_place_);
+  }
 }
 
 std::shared_ptr<egr::GradNodeBase> EagerReducer::GetGradNodeFromTensor(
@@ -341,21 +363,8 @@ void EagerReducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(tensor_indices_, &group);
-      experimental::Backend backend;
-      switch (inner_place_.GetType()) {
-        case phi::AllocationType::GPU:
-          backend = experimental::Backend::GPU;
-          break;
-        case phi::AllocationType::CPU:
-          backend = experimental::Backend::CPU;
-          break;
-        default:
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Place type (%s) is not supported. ", inner_place_));
-          break;
-      }
       group.dense_contents_ = paddle::experimental::empty(
-          ScalarArray({group.all_length_}), group.dtype_, backend);
+          ScalarArray({group.all_length_}), group.dtype_, inner_place_);
     }
 
     // map tensors to this group by VariableLocator
@@ -418,6 +427,53 @@ void EagerReducer::InitializeDenseGroups(
   p_group->all_length_ = all_length;
 }
 
+void EagerReducer::TraverseBackwardGraph(const std::vector<Tensor> &outputs) {
+  std::queue<egr::GradNodeBase *> queue;
+  std::set<egr::GradNodeBase *> visited;
+
+  for (const auto &output : outputs) {
+    auto *auto_grad_meta =
+        static_cast<egr::AutogradMeta *>(output.get_autograd_meta());
+    if (!auto_grad_meta) continue;
+    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
+    if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
+        auto_grad_meta->StopGradient()) {
+      continue;
+    }
+    egr::GradNodeBase *grad_node = shared_grad_node.get();
+    queue.emplace(grad_node);
+  }
+
+  while (!queue.empty()) {
+    egr::GradNodeBase *node = queue.front();
+    queue.pop();
+    const std::vector<std::vector<egr::Edge>> &edges = node->GetEdges();
+    for (size_t i = 0; i < edges.size(); i++) {
+      for (size_t j = 0; j < edges[i].size(); j++) {
+        const egr::Edge &edge = edges[i][j];
+        auto next_node_shared = edge.GetMutableGradNode();
+        if (!next_node_shared || !next_node_shared.get()) {
+          continue;
+        }
+        auto *next_node = next_node_shared.get();
+        const bool was_inserted = visited.insert(next_node).second;
+        if (was_inserted) {
+          queue.emplace(next_node);
+        }
+      }
+    }
+  }
+
+  for (const auto &it : gradnode_index_map_) {
+    if (visited.count(it.first) == 0) {
+      unused_vars_.push_back(it.second);
+      VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
+              << "Tensor " << tensors_[it.second].name() << " at index "
+              << it.second << " is marked as unused.";
+    }
+  }
+}
+
 void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
   VLOG(3) << "after forward, then reset count for backward.";
   grad_need_hooks_ = true;
@@ -429,6 +485,51 @@ void EagerReducer::PrepareForBackward(const std::vector<Tensor> &outputs) {
   // reinitialize vars_marked_ready_ for next iteration
   vars_marked_ready_.clear();
   vars_marked_ready_.resize(tensors_.size(), false);
+
+  PADDLE_ENFORCE_EQ(
+      groups_need_finalize_, false,
+      platform::errors::PreconditionNotMet(
+          "A serious error has occurred here. Please "
+          "set find_unused_parameters=True to traverse backward graph "
+          "in each step to prepare reduce in advance. If you have "
+          "set, There may be several reasons for this error: "
+          "1) Please note that all forward outputs derived from the module "
+          "parameters must participate in the calculation of losses and "
+          "subsequent gradient calculations. If not, the wrapper will hang, "
+          "waiting for autograd to generate gradients for these parameters. "
+          "you can use detach or stop_gradient to make the unused parameters "
+          "detached from the autograd graph. "
+          "2) Used multiple forwards and one backward. You may be able to wrap "
+          "multiple forwards in a model."));
+
+  // The first var to trigger the unused parameter
+  has_marked_unused_vars_ = false;
+
+  if (find_unused_vars_once_ || find_unused_vars_each_step_) {
+    unused_vars_.clear();
+    TraverseBackwardGraph(outputs);
+    // only check once in first step
+    find_unused_vars_once_ = false;
+  }
+
+  if (find_unused_vars_each_step_ && unused_vars_.empty()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "All parameters are involved in the backward pass. "
+           "It is recommended to set find_unused_parameters to False "
+           "to improve performance. However, if unused parameters "
+           "appear in subsequent iterative training, then an error "
+           "will occur. Please make it clear that in the subsequent "
+           "training, there will be no parameters that are not used "
+           "in the backward pass, and then set find_unused_parameters";
+  }
+
+  if (unused_vars_.size() == tensors_.size()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "There is no parameter in the device involved "
+           "in the backward calculation. If there are "
+           "parameters on other devices involved in the "
+           "backward, then a serious error will occur here.";
+  }
 }
 
 void EagerReducer::AddDistHook(size_t var_index) {
@@ -446,36 +547,104 @@ void EagerReducer::AddDistHook(size_t var_index) {
   auto &tensor = tensors_[var_index];
   const auto &grad_node = GetGradNodeFromTensor(&tensor);
 
-  VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name()
-          << "] arrived and triggered disthook";
+  VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name()
+          << "@Grad] arrived and triggered disthook";
 
   local_used_vars_[var_index] = 1;
 
+  if (!has_marked_unused_vars_) {
+    has_marked_unused_vars_ = true;
+    for (const auto unused_index : unused_vars_) {
+      MarkVarReady(unused_index, false);
+    }
+  }
   MarkVarReady(var_index, true);
 }
 
 void EagerReducer::MarkVarReady(const size_t var_index,
                                 const bool is_used_var) {
+  VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name()
+          << "] is marked ready.";
+  // error happened, if the var is ready before.
+  if (vars_marked_ready_[var_index]) {
+    auto error_info = string::Sprintf(
+        "Error happened, when parameter[%d][%s] has been ready before. "
+        "Please set find_unused_parameters=True to traverse backward graph "
+        "in each step to prepare reduce in advance. If you have set, "
+        "there may be several reasons for this error: "
+        "1) In multiple reentrant backward phase, some parameters are reused."
+        "2) Using model parameters outside of forward function. Please "
+        "make sure that model parameters are not shared in concurrent "
+        "forward-backward passes.",
+        var_index, tensors_[var_index].name());
+
+    PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false,
+                      platform::errors::PreconditionNotMet(error_info));
+
+    error_info +=
+        "3) Unused parameters retrieval is incorrect. "
+        "The return value of forward will be used to retrieve"
+        " the unused parameters of the entire model. These "
+        "gradients of unused parameters will not be synchronized "
+        "between multiple cards. However, if the unused "
+        "parameters participate in the backward calculation "
+        "again at a later time (e.g. after the forward function, "
+        "the loss calculation uses the unused "
+        "paramters of the forward and trigger backward), "
+        "its gradient will be wrong.";
+
+    PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true,
+                      platform::errors::PreconditionNotMet(error_info));
+  } else {
+    vars_marked_ready_[var_index] = true;
+  }
+  groups_need_finalize_ = true;
+
   const auto &var_locator = variable_locators_[var_index];
   const auto group_index = var_locator.group_index;
   const auto inside_group_index = var_locator.inside_group_index;
 
   auto &group = groups_[group_index];
   auto &group_tensor = group.dense_tensors_[inside_group_index];
-  auto *autograd_meta = tensors_[var_index].get_autograd_meta();
-  auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
-
-  group_tensor
-      .ShareDataWith(
-          *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
-      .Resize({grad_tensor.numel()});
-
-  vars_marked_ready_[var_index] = true;
+  const auto length = group.length_[inside_group_index];
+
+  if (is_used_var) {
+    auto *autograd_meta = tensors_[var_index].get_autograd_meta();
+    auto &grad_tensor = static_cast<egr::AutogradMeta *>(autograd_meta)->Grad();
+    group_tensor
+        .ShareDataWith(
+            *(std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor.impl())))
+        .Resize({grad_tensor.numel()});
+  } else {
+    // TODO(shenliang03): maybe save the memory by avoiding tensor construction
+    if (!group_tensor.initialized()) {
+      group_tensor.Resize({static_cast<int64_t>(length)});
+      group_tensor.mutable_data(inner_place_, group.dtype_);
+    }
+    if (HasGrad(var_index)) {
+      VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad";
+      auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]);
+      group_tensor
+          .ShareDataWith(*(
+              std::dynamic_pointer_cast<phi::DenseTensor>(grad_tensor->impl())))
+          .Resize({length});
+    } else {
+      VLOG(3) << "Tensor[" << tensors_[var_index].name()
+              << "] doesn't have grad";
+      auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
+      group_tensor.Resize({static_cast<int64_t>(length)});
+      phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
+    }
+  }
 
   if (--group.pending_ == 0) {
     // can start allreduce
     MarkGroupReady(group_index);
   }
+
+  if (next_group_ == groups_.size()) {
+    FinalizeBackward();
+  }
 }
 
 void EagerReducer::MarkGroupReady(size_t group_index) {
@@ -501,6 +670,92 @@ void EagerReducer::MarkGroupReady(size_t group_index) {
   }
 }
 
+bool EagerReducer::HasGrad(size_t var_index) {
+  auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]);
+  if (grad && grad->is_initialized()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void EagerReducer::ProcessUnusedDenseVars() {
+  // The calculation stream must be used here to
+  // avoid conflicts with communication.
+  VLOG(3) << "Local used vars : "
+          << string::join_strings(local_used_vars_, ',');
+
+  const auto *dev_ctx =
+      platform::DeviceContextPool::Instance().Get(inner_place_);
+  auto *global_used_tensor =
+      std::dynamic_pointer_cast<phi::DenseTensor>(global_used_vars_.impl())
+          .get();
+  framework::TensorFromVector<int32_t>(local_used_vars_, *dev_ctx,
+                                       global_used_tensor);
+
+  distributed::AllreduceOptions opts;
+  opts.reduce_op = ReduceOp::SUM;
+  std::vector<Tensor> reduce_tensors = {global_used_vars_};
+  process_group_->AllReduce(reduce_tensors, opts)->Synchronize();
+
+  framework::TensorToVector<int>(*global_used_tensor, *dev_ctx,
+                                 &local_used_vars_);
+  dev_ctx->Wait();
+
+  // sync compute stream to get global used var message,
+  // but maybe affect speed performance
+  VLOG(3) << "Global used vars : "
+          << string::join_strings(local_used_vars_, ',');
+
+  for (const auto var_index : unused_vars_) {
+    const bool global_unused = (local_used_vars_[var_index] == 0);
+
+    // global used but local unused, set grad
+    VLOG(3) << "[Rank " << process_group_->GetRank() << "]: "
+            << "Var [" << var_index << "] [" << tensors_[var_index].name()
+            << "] global_unused: " << global_unused
+            << "  has grad: " << HasGrad(var_index);
+
+    if (!global_unused) {
+      VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank "
+              << process_group_->GetRank() << "]";
+      const auto &var_locator = variable_locators_[var_index];
+      const auto group_index = var_locator.group_index;
+      const auto &group = groups_[group_index];
+      const auto inside_group_index = var_locator.inside_group_index;
+      auto &src_tensor = group.dense_tensors_[inside_group_index];
+
+      Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));
+
+      auto dest_var_base = tensors_[var_index];
+      auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base);
+      grad_tensor->copy_(grad_value, inner_place_, true);
+      grad_tensor->reshape(dest_var_base.shape());
+    }
+  }
+}
+
+void EagerReducer::FinalizeBackward() {
+  groups_need_finalize_ = false;
+  grad_need_hooks_ = false;
+  for (auto &group : groups_) {
+    group.task->Synchronize();
+  }
+
+  for (auto &group : groups_) {
+    group.SplitTensors(inner_place_);
+  }
+
+  if (find_unused_vars_each_step_) {
+    ProcessUnusedDenseVars();
+    local_used_vars_.clear();
+    local_used_vars_.resize(tensors_.size(), 0);
+    VLOG(3) << "ProcessUnusedDenseVars is finished.";
+  }
+
+  VLOG(3) << "In the batch, Reducer is finished.";
+}
+
 void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
                                           const int curr_group_index) {
   // The overall timeline: concat > div_nranks > allreduce > split
@@ -513,24 +768,14 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
   group->ConcatTensors(inner_place_);
 
   // div nranks
-  double scaling = 1.0 / nranks_;
-  paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false);
+  paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0,
+                               false);
 
   // all_reduce
   std::vector<Tensor> reduce_tensors = {group->dense_contents_};
-  tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts));
+  group->task = process_group_->AllReduce(reduce_tensors, opts);
 
-  if (tasks_.size() == groups_.size()) {
-    for (size_t index = 0; index < tasks_.size(); index++) {
-      auto &task = tasks_.back();
-      task->Synchronize();
-      tasks_.pop_back();
-    }
-    for (size_t index = 0; index < groups_.size(); index++) {
-      auto &group = groups_[index];
-      group.SplitTensors(inner_place_);
-    }
-  }
+  // split in FinalizeBackward()
 }
 
 std::ostream &operator<<(std::ostream &out, const EagerGroup &group) {
diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h
index ac6f3fbe5956c..d3ffa8498a14b 100644
--- a/paddle/fluid/distributed/collective/reducer.h
+++ b/paddle/fluid/distributed/collective/reducer.h
@@ -28,6 +28,8 @@
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/ext_compat_utils.h"
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/utils/string/string_helper.h"
 
 namespace paddle {
 namespace distributed {
@@ -35,6 +37,7 @@ using Tensor = paddle::experimental::Tensor;
 using Scalar = paddle::experimental::ScalarBase<paddle::experimental::Tensor>;
 using ScalarArray =
     paddle::experimental::ScalarArrayBase<paddle::experimental::Tensor>;
+using Backend = paddle::experimental::Backend;
 
 std::vector<std::vector<size_t>> Eager_AssignGroupBySize(
     const std::vector<Tensor>, const std::vector<bool> &is_sparse_gradient,
@@ -61,6 +64,9 @@ class EagerGroup {
   // external message of group
   phi::DataType dtype_;
 
+  // help to sync
+  std::shared_ptr<ProcessGroup::Task> task;
+
   // context is used to select the stream for concat
   void ConcatTensors(const platform::Place &);
 
@@ -98,6 +104,10 @@ class EagerReducer {
   void MarkVarReady(const size_t var_index, const bool is_used_var);
   void MarkGroupReady(const size_t group_index);
   void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index);
+  void FinalizeBackward();
+  void TraverseBackwardGraph(const std::vector<Tensor> &outputs);
+  void ProcessUnusedDenseVars();
+  bool HasGrad(size_t var_index);
 
  private:
   std::vector<Tensor> tensors_;
@@ -105,7 +115,6 @@ class EagerReducer {
   std::vector<bool> is_sparse_gradient_;
   std::shared_ptr<distributed::ProcessGroup> process_group_;
   std::vector<size_t> group_size_limits_;
-  bool find_unused_vars_each_step_;
 
   std::vector<EagerGroup> groups_;
   std::vector<TensorLocator> variable_locators_;
@@ -113,12 +122,20 @@ class EagerReducer {
   platform::Place inner_place_;
   size_t next_group_ = 0;
   int64_t nranks_ = -1;
-  std::vector<std::shared_ptr<paddle::distributed::ProcessGroup::Task>> tasks_;
 
   bool grad_need_hooks_{false};
 
   std::vector<bool> vars_marked_ready_;
-  std::vector<int> local_used_vars_;
+  std::vector<int32_t> local_used_vars_;
+
+  // Following variables are to help unused vars
+  std::vector<size_t> unused_vars_;
+  std::map<egr::GradNodeBase *, size_t> gradnode_index_map_;
+  bool has_marked_unused_vars_{false};
+  bool find_unused_vars_each_step_{false};
+  bool find_unused_vars_once_{true};
+  bool groups_need_finalize_{false};
+  Tensor global_used_vars_;
 };
 
 }  //  namespace distributed
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index 3e734b1b9ed24..8641b36a1be8e 100644
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -4,7 +4,7 @@ if(WITH_PYTHON)
 endif()
 proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 
-if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+if(WITH_DISTRIBUTE AND WITH_PSCORE)
   set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog)
 else()
   set(BRPC_DEPS "")
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
index 8d2ec5c41d864..80a6b4667aa1a 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; }
 
 MessageBus::~MessageBus() {
   VLOG(3) << "Message bus releases resource.";
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   server_.Stop(1000);
   server_.Join();
 #endif
@@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank,
       IsInit(), true,
       platform::errors::PreconditionNotMet(
           "Using message bus since it has not been initialized."));
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   int retry_time = 0;  // message bus will retry sending for 10 times
   while (retry_time < 10) {
     ++retry_time;
@@ -173,8 +171,7 @@ void MessageBus::ListenPort() {
     LOG(INFO) << "No need listen to port since training on single card.";
     return;
   }
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // function keep listen the port and handle the message
   PADDLE_ENFORCE_EQ(
       server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0,
@@ -203,8 +200,7 @@ void MessageBus::ListenPort() {
 #endif
 }
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 bool MessageBus::SendInterRank(int64_t dst_rank,
                                const InterceptorMessage& interceptor_message) {
   const auto& dst_addr = GetAddr(dst_rank);
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h
index d805ac81606b8..dfd65fdbc00d4 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.h
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.h
@@ -20,8 +20,7 @@
 #include <thread>
 #include <unordered_map>
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "brpc/channel.h"
 #include "brpc/server.h"
 #include "paddle/fluid/distributed/fleet_executor/message_service.h"
@@ -64,8 +63,7 @@ class MessageBus final {
 
   const std::string& GetAddr(int64_t rank) const;
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // send the message inter rank (dst is different rank with src)
   bool SendInterRank(int64_t dst_rank,
                      const InterceptorMessage& interceptor_message);
@@ -81,8 +79,7 @@ class MessageBus final {
   // the ip needs to be listened
   std::string addr_;
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   MessageServiceImpl message_service_;
   // brpc server
   brpc::Server server_;
diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc
index c3fff98f684ad..1c66d83ea34d7 100644
--- a/paddle/fluid/distributed/fleet_executor/message_service.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_service.cc
@@ -11,8 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/fleet_executor/message_service.h"
 #include "brpc/server.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h
index 02f73471e3b91..5ab687ff93dc4 100644
--- a/paddle/fluid/distributed/fleet_executor/message_service.h
+++ b/paddle/fluid/distributed/fleet_executor/message_service.h
@@ -11,8 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #pragma once
 
 #include "brpc/server.h"
diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto
index 0ae87812bce43..fac30e26c388c 100644
--- a/paddle/fluid/distributed/ps.proto
+++ b/paddle/fluid/distributed/ps.proto
@@ -115,6 +115,7 @@ message TableParameter {
   optional CommonAccessorParameter common = 6;
   optional TableType type = 7;
   optional bool compress_in_save = 8 [ default = false ];
+  optional GraphParameter graph_parameter = 9;
 }
 
 message TableAccessorParameter {
@@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule
   optional double ada_epsilon = 5 [ default = 1e-08 ];
   repeated float weight_bounds = 6;
 }
+
+message GraphParameter {
+  optional int32 task_pool_size = 1 [ default = 24 ];
+  optional bool gpups_mode = 2 [ default = false ];
+  optional string gpups_graph_sample_class = 3
+      [ default = "CompleteGraphSampler" ];
+  optional string gpups_graph_sample_args = 4 [ default = "" ];
+  optional bool use_cache = 5 [ default = true ];
+  optional float cache_ratio = 6 [ default = 0.3 ];
+  optional int32 cache_ttl = 7 [ default = 5 ];
+  optional GraphFeature graph_feature = 8;
+  optional string table_name = 9 [ default = "" ];
+  optional string table_type = 10 [ default = "" ];
+  optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
+  optional int32 gpu_num = 12 [ default = 1 ];
+}
+
+message GraphFeature {
+  repeated string name = 1;
+  repeated string dtype = 2;
+  repeated int32 shape = 3;
+}
\ No newline at end of file
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
index b8ccd8e744dab..f86b4b706b3e2 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -414,6 +414,16 @@ std::future<int32_t> BrpcPsClient::load(uint32_t table_id,
   return send_cmd(table_id, PS_LOAD_ONE_TABLE, {epoch, mode});
 }
 
+std::future<int32_t> BrpcPsClient::Load(const LoadSaveContext &load_context) {
+  if (load_context.table_id < 0) {
+    return send_cmd(-1, PS_LOAD_ALL_TABLE,
+                    {load_context.epoch, load_context.mode});
+  } else {
+    return send_cmd(load_context.table_id, PS_LOAD_ONE_TABLE,
+                    {load_context.epoch, load_context.mode});
+  }
+}
+
 std::future<int32_t> BrpcPsClient::save(const std::string &epoch,
                                         const std::string &mode) {
   VLOG(1) << "BrpcPsClient::save path " << epoch;
@@ -427,6 +437,19 @@ std::future<int32_t> BrpcPsClient::save(uint32_t table_id,
   return send_save_cmd(table_id, PS_SAVE_ONE_TABLE, {epoch, mode});
 }
 
+std::future<int32_t> BrpcPsClient::Save(const LoadSaveContext &save_context) {
+  if (save_context.table_id < 0) {
+    VLOG(1) << "BrpcPsClient::save path " << save_context.epoch;
+    return send_save_cmd(-1, PS_SAVE_ALL_TABLE,
+                         {save_context.epoch, save_context.mode});
+  } else {
+    VLOG(1) << "BrpcPsClient::save one table path " << save_context.epoch
+            << " table_id " << save_context.table_id;
+    return send_save_cmd(save_context.table_id, PS_SAVE_ONE_TABLE,
+                         {save_context.epoch, save_context.mode});
+  }
+}
+
 std::future<int32_t> BrpcPsClient::clear() {
   return send_cmd(-1, PS_CLEAR_ALL_TABLE, {});
 }
@@ -505,6 +528,44 @@ std::future<int32_t> BrpcPsClient::barrier(size_t table_id,
   return send_cmd(table_id, PS_BARRIER, {std::to_string(barrier_type)});
 }
 
+std::future<int32_t> BrpcPsClient::Pull(RequestContext &pull_context) {
+  if (pull_context.value_type == Dense) {  // pull dense
+    Region *dense_region =
+        reinterpret_cast<Region *>(pull_context.dense_values);
+    pull_dense(dense_region, pull_context.num, pull_context.table);
+  } else {  // pull sparse
+    uint64_t *keys = reinterpret_cast<uint64_t *>(pull_context.keys);
+    float **select_values =
+        reinterpret_cast<float **>(pull_context.sparse_values);
+    size_t table_id = pull_context.table;
+    size_t num = pull_context.num;
+    bool is_training = pull_context.is_training;
+    if (pull_context.training_mode == Geo) {  // for geo
+      pull_sparse_param(select_values, table_id, keys, num, is_training);
+    } else if (pull_context.training_mode == Async) {  // for async
+      pull_sparse(select_values, table_id, keys, num, is_training);
+    }
+  }
+}
+
+std::future<int32_t> BrpcPsClient::Push(RequestContext &push_context) {
+  if (push_context.value_type == Dense) {  // push dense
+    const Region *dense_region = push_context.push_context.push_dense_values;
+    push_dense(dense_region, push_context.num, push_context.table);
+  } else {  // push sparse
+    size_t table_id = push_context.table;
+    size_t num = push_context.num;
+    bool is_training = push_context.is_training;
+    if (push_context.training_mode == Geo) {  // for geo
+      // TODO(zhaocaibei)
+    } else if (push_context.training_mode == Async) {  // for async
+      const uint64_t *keys = push_context.push_context.keys;
+      const float **update_values = push_context.push_context.push_values;
+      push_sparse(table_id, keys, update_values, num);
+    }
+  }
+}
+
 std::future<int32_t> BrpcPsClient::pull_geo_param(size_t table_id,
                                                   std::vector<float> *values,
                                                   std::vector<uint64_t> *keys,
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
index 59ed59933db86..8b0cb0741b400 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h
@@ -163,12 +163,17 @@ class BrpcPsClient : public PSClient {
   std::future<int32_t> load(uint32_t table_id, const std::string &epoch,
                             const std::string &mode) override;
 
+  std::future<int32_t> Load(const LoadSaveContext &load_context) override;
+
   std::future<int32_t> save(const std::string &epoch,
                             const std::string &mode) override;
 
   std::future<int32_t> save(uint32_t table_id, const std::string &epoch,
                             const std::string &mode) override;
 
+  virtual std::future<int32_t> Save(
+      const LoadSaveContext &save_context) override;
+
   std::future<int32_t> clear() override;
 
   std::future<int32_t> clear(uint32_t table_id) override;
@@ -199,6 +204,10 @@ class BrpcPsClient : public PSClient {
                                                  const uint64_t *keys,
                                                  size_t num, bool is_training);
 
+  virtual std::future<int32_t> Pull(RequestContext &pull_context) override;
+
+  virtual std::future<int32_t> Push(RequestContext &push_context) override;
+
   virtual std::future<int32_t> print_table_stat(uint32_t table_id);
 
   virtual std::future<int32_t> barrier(size_t table_id, uint32_t barrier_type);
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.h b/paddle/fluid/distributed/ps/service/brpc_ps_server.h
index 4310c247438ce..d81a3a5df07f1 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_server.h
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.h
@@ -51,7 +51,7 @@ class BrpcPsServer : public PSServer {
     _server.Join();
     return 0;
   }
-  virtual int32_t port();
+  int32_t port();
 
  private:
   virtual int32_t initialize();
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
index 301708f6b7bb3..a3db88e3b679d 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc
@@ -44,7 +44,7 @@ void GraphPsService_Stub::service(
   }
 }
 
-int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
+int GraphBrpcClient::get_server_index_by_id(int64_t id) {
   int shard_num = get_shard_num();
   int shard_per_server = shard_num % server_size == 0
                              ? shard_num / server_size
@@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) {
 }
 
 std::future<int32_t> GraphBrpcClient::get_node_feat(
-    const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
+    const uint32_t &table_id, const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     std::vector<std::vector<std::string>> &res) {
   std::vector<int> request2server;
@@ -66,7 +66,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
     int server_index = get_server_index_by_id(node_ids[query_idx]);
@@ -129,7 +129,7 @@ std::future<int32_t> GraphBrpcClient::get_node_feat(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     std::string joint_feature_name =
         paddle::string::join_strings(feature_names, '\t');
     closure->request(request_idx)
@@ -179,9 +179,9 @@ std::future<int32_t> GraphBrpcClient::clear_nodes(uint32_t table_id) {
   return fut;
 }
 std::future<int32_t> GraphBrpcClient::add_graph_node(
-    uint32_t table_id, std::vector<uint64_t> &node_id_list,
+    uint32_t table_id, std::vector<int64_t> &node_id_list,
     std::vector<bool> &is_weighted_list) {
-  std::vector<std::vector<uint64_t>> request_bucket;
+  std::vector<std::vector<int64_t>> request_bucket;
   std::vector<std::vector<bool>> is_weighted_bucket;
   bool add_weight = is_weighted_list.size() > 0;
   std::vector<int> server_index_arr;
@@ -191,7 +191,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
     if (index_mapping[server_index] == -1) {
       index_mapping[server_index] = request_bucket.size();
       server_index_arr.push_back(server_index);
-      request_bucket.push_back(std::vector<uint64_t>());
+      request_bucket.push_back(std::vector<int64_t>());
       if (add_weight) is_weighted_bucket.push_back(std::vector<bool>());
     }
     request_bucket[index_mapping[server_index]].push_back(
@@ -229,7 +229,7 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
     size_t node_num = request_bucket[request_idx].size();
     closure->request(request_idx)
         ->add_params((char *)request_bucket[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     if (add_weight) {
       bool weighted[is_weighted_bucket[request_idx].size() + 1];
       for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++)
@@ -248,8 +248,8 @@ std::future<int32_t> GraphBrpcClient::add_graph_node(
   return fut;
 }
 std::future<int32_t> GraphBrpcClient::remove_graph_node(
-    uint32_t table_id, std::vector<uint64_t> &node_id_list) {
-  std::vector<std::vector<uint64_t>> request_bucket;
+    uint32_t table_id, std::vector<int64_t> &node_id_list) {
+  std::vector<std::vector<int64_t>> request_bucket;
   std::vector<int> server_index_arr;
   std::vector<int> index_mapping(server_size, -1);
   for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) {
@@ -257,7 +257,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
     if (index_mapping[server_index] == -1) {
       index_mapping[server_index] = request_bucket.size();
       server_index_arr.push_back(server_index);
-      request_bucket.push_back(std::vector<uint64_t>());
+      request_bucket.push_back(std::vector<int64_t>());
     }
     request_bucket[index_mapping[server_index]].push_back(
         node_id_list[query_idx]);
@@ -291,7 +291,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
 
     closure->request(request_idx)
         ->add_params((char *)request_bucket[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     // PsService_Stub rpc_stub(get_cmd_channel(server_index));
     GraphPsService_Stub rpc_stub =
         getServiceStub(get_cmd_channel(server_index));
@@ -303,9 +303,9 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
 }
 // char* &buffer,int &actual_size
 std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
-    uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
-    // std::vector<std::vector<std::pair<uint64_t, float>>> &res,
-    std::vector<std::vector<uint64_t>> &res,
+    uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
+    // std::vector<std::vector<std::pair<int64_t, float>>> &res,
+    std::vector<std::vector<int64_t>> &res,
     std::vector<std::vector<float>> &res_weight, bool need_weight,
     int server_index) {
   if (server_index != -1) {
@@ -337,7 +337,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
           int start = 0;
           while (start < actual_size) {
             res[node_idx].emplace_back(
-                *(uint64_t *)(node_buffer + offset + start));
+                *(int64_t *)(node_buffer + offset + start));
             start += GraphNode::id_size;
             if (need_weight) {
               res_weight[node_idx].emplace_back(
@@ -358,7 +358,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
     closure->request(0)->set_table_id(table_id);
     closure->request(0)->set_client_id(_client_id);
     closure->request(0)->add_params((char *)node_ids.data(),
-                                    sizeof(uint64_t) * node_ids.size());
+                                    sizeof(int64_t) * node_ids.size());
     closure->request(0)->add_params((char *)&sample_size, sizeof(int));
     closure->request(0)->add_params((char *)&need_weight, sizeof(bool));
     ;
@@ -380,14 +380,14 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
       server2request[server_index] = request2server.size();
       request2server.push_back(server_index);
     }
-    // res.push_back(std::vector<std::pair<uint64_t, float>>());
+    // res.push_back(std::vector<std::pair<int64_t, float>>());
     res.push_back({});
     if (need_weight) {
       res_weight.push_back({});
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) {
     int server_index = get_server_index_by_id(node_ids[query_idx]);
@@ -428,7 +428,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
               int start = 0;
               while (start < actual_size) {
                 res[query_idx].emplace_back(
-                    *(uint64_t *)(node_buffer + offset + start));
+                    *(int64_t *)(node_buffer + offset + start));
                 start += GraphNode::id_size;
                 if (need_weight) {
                   res_weight[query_idx].emplace_back(
@@ -459,7 +459,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     closure->request(request_idx)
         ->add_params((char *)&sample_size, sizeof(int));
     closure->request(request_idx)
@@ -476,7 +476,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
 }
 std::future<int32_t> GraphBrpcClient::random_sample_nodes(
     uint32_t table_id, int server_index, int sample_size,
-    std::vector<uint64_t> &ids) {
+    std::vector<int64_t> &ids) {
   DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) {
     int ret = 0;
     auto *closure = (DownpourBrpcClosure *)done;
@@ -490,7 +490,7 @@ std::future<int32_t> GraphBrpcClient::random_sample_nodes(
       auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size);
       int index = 0;
       while (index < bytes_size) {
-        ids.push_back(*(uint64_t *)(buffer + index));
+        ids.push_back(*(int64_t *)(buffer + index));
         index += GraphNode::id_size;
       }
       delete[] buffer;
@@ -633,7 +633,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list(
 }
 
 std::future<int32_t> GraphBrpcClient::set_node_feat(
-    const uint32_t &table_id, const std::vector<uint64_t> &node_ids,
+    const uint32_t &table_id, const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     const std::vector<std::vector<std::string>> &features) {
   std::vector<int> request2server;
@@ -646,7 +646,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
     }
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   std::vector<std::vector<std::vector<std::string>>> features_idx_buckets(
       request_call_num);
@@ -696,7 +696,7 @@ std::future<int32_t> GraphBrpcClient::set_node_feat(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     std::string joint_feature_name =
         paddle::string::join_strings(feature_names, '\t');
     closure->request(request_idx)
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
index 06e753d028baa..e2b8a518615dc 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h
@@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual ~GraphBrpcClient() {}
   // given a batch of nodes, sample graph_neighbors for each of them
   virtual std::future<int32_t> batch_sample_neighbors(
-      uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
-      std::vector<std::vector<uint64_t>>& res,
+      uint32_t table_id, std::vector<int64_t> node_ids, int sample_size,
+      std::vector<std::vector<int64_t>>& res,
       std::vector<std::vector<float>>& res_weight, bool need_weight,
       int server_index = -1);
 
@@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual std::future<int32_t> random_sample_nodes(uint32_t table_id,
                                                    int server_index,
                                                    int sample_size,
-                                                   std::vector<uint64_t>& ids);
+                                                   std::vector<int64_t>& ids);
   virtual std::future<int32_t> get_node_feat(
-      const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
+      const uint32_t& table_id, const std::vector<int64_t>& node_ids,
       const std::vector<std::string>& feature_names,
       std::vector<std::vector<std::string>>& res);
 
   virtual std::future<int32_t> set_node_feat(
-      const uint32_t& table_id, const std::vector<uint64_t>& node_ids,
+      const uint32_t& table_id, const std::vector<int64_t>& node_ids,
       const std::vector<std::string>& feature_names,
       const std::vector<std::vector<std::string>>& features);
 
   virtual std::future<int32_t> clear_nodes(uint32_t table_id);
   virtual std::future<int32_t> add_graph_node(
-      uint32_t table_id, std::vector<uint64_t>& node_id_list,
+      uint32_t table_id, std::vector<int64_t>& node_id_list,
       std::vector<bool>& is_weighted_list);
   virtual std::future<int32_t> use_neighbors_sample_cache(uint32_t table_id,
                                                           size_t size_limit,
@@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual std::future<int32_t> load_graph_split_config(uint32_t table_id,
                                                        std::string path);
   virtual std::future<int32_t> remove_graph_node(
-      uint32_t table_id, std::vector<uint64_t>& node_id_list);
+      uint32_t table_id, std::vector<int64_t>& node_id_list);
   virtual int32_t initialize();
   int get_shard_num() { return shard_num; }
   void set_shard_num(int shard_num) { this->shard_num = shard_num; }
-  int get_server_index_by_id(uint64_t id);
+  int get_server_index_by_id(int64_t id);
   void set_local_channel(int index) {
     this->local_channel = get_cmd_channel(index);
   }
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
index 441f489fb3097..20a55e4d11983 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
@@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table,
     return 0;
   }
 
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
   std::vector<bool> is_weighted_list;
   if (request.params_size() == 2) {
     size_t weight_list_size = request.params(1).size() / sizeof(bool);
@@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table,
         "graph_get_node_feat request requires at least 1 argument");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
 
   ((GraphTable *)table)->remove_graph_node(node_ids);
   return 0;
@@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
         "graph_random_sample_neighbors request requires at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  int sample_size = *(uint64_t *)(request.params(1).c_str());
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  int sample_size = *(int64_t *)(request.params(1).c_str());
   bool need_weight = *(bool *)(request.params(2).c_str());
   std::vector<std::shared_ptr<char>> buffers(node_num);
   std::vector<int> actual_sizes(node_num, 0);
@@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors(
 int32_t GraphBrpcService::graph_random_sample_nodes(
     Table *table, const PsRequestMessage &request, PsResponseMessage &response,
     brpc::Controller *cntl) {
-  size_t size = *(uint64_t *)(request.params(0).c_str());
+  size_t size = *(int64_t *)(request.params(0).c_str());
   std::unique_ptr<char[]> buffer;
   int actual_size;
   if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) ==
@@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
         "graph_get_node_feat request requires at least 2 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
 
   std::vector<std::string> feature_names =
       paddle::string::split_string<std::string>(request.params(1), "\t");
@@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
                       "at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t),
+  size_t node_num = request.params(0).size() / sizeof(int64_t),
          size_of_size_t = sizeof(size_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  int sample_size = *(uint64_t *)(request.params(1).c_str());
-  bool need_weight = *(uint64_t *)(request.params(2).c_str());
-  // std::vector<uint64_t> res = ((GraphTable
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  int sample_size = *(int64_t *)(request.params(1).c_str());
+  bool need_weight = *(int64_t *)(request.params(2).c_str());
+  // std::vector<int64_t> res = ((GraphTable
   // *)table).filter_out_non_exist_nodes(node_data, sample_size);
   std::vector<int> request2server;
   std::vector<int> server2request(server_size, -1);
-  std::vector<uint64_t> local_id;
+  std::vector<int64_t> local_id;
   std::vector<int> local_query_idx;
   size_t rank = get_rank();
   for (int query_idx = 0; query_idx < node_num; ++query_idx) {
@@ -496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
   std::vector<std::shared_ptr<char>> local_buffers;
   std::vector<int> local_actual_sizes;
   std::vector<size_t> seq;
-  std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
+  std::vector<std::vector<int64_t>> node_id_buckets(request_call_num);
   std::vector<std::vector<int>> query_idx_buckets(request_call_num);
   for (int query_idx = 0; query_idx < node_num; ++query_idx) {
     int server_index =
@@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
 
     closure->request(request_idx)
         ->add_params((char *)node_id_buckets[request_idx].data(),
-                     sizeof(uint64_t) * node_num);
+                     sizeof(int64_t) * node_num);
     closure->request(request_idx)
         ->add_params((char *)&sample_size, sizeof(int));
     closure->request(request_idx)
@@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
         "graph_set_node_feat request requires at least 3 arguments");
     return 0;
   }
-  size_t node_num = request.params(0).size() / sizeof(uint64_t);
-  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
-  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+  size_t node_num = request.params(0).size() / sizeof(int64_t);
+  int64_t *node_data = (int64_t *)(request.params(0).c_str());
+  std::vector<int64_t> node_ids(node_data, node_data + node_num);
 
   std::vector<std::string> feature_names =
       paddle::string::split_string<std::string>(request.params(1), "\t");
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.h b/paddle/fluid/distributed/ps/service/graph_brpc_server.h
index aee0190850753..a978d97b296b0 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.h
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.h
@@ -43,7 +43,7 @@ class GraphBrpcServer : public PSServer {
     _server.Join();
     return 0;
   }
-  virtual int32_t port();
+  int32_t port();
 
   std::condition_variable *export_cv() { return &cv_; }
 
diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h
index 21719fbdbf1d6..8a2bfbe31602b 100644
--- a/paddle/fluid/distributed/ps/service/ps_client.h
+++ b/paddle/fluid/distributed/ps/service/ps_client.h
@@ -26,6 +26,7 @@
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
+#include "paddle/fluid/distributed/ps/table/table.h"
 #include "paddle/fluid/platform/timer.h"
 
 namespace paddle {
@@ -59,6 +60,41 @@ class PSClientClosure : public google::protobuf::Closure {
   std::vector<std::shared_ptr<std::promise<int32_t>>> _promises;
 };
 
+struct LoadSaveContext {
+  int table_id;
+  std::string epoch;
+  std::string mode;
+};
+
+enum TrainingMode { Async = 0, Sync = 1, Geo = 3 };
+
+enum TrainingPhase { Init = 0, Train = 1, Save = 2 };
+
+// enum ValueType {
+//   Sparse = 0,
+//   Dense = 1
+// };
+
+struct PushContext {
+  const uint64_t *keys;
+  const float **push_values;
+  const Region *push_dense_values;
+};
+
+struct RequestContext {
+  int table;
+  TrainingMode training_mode;    // 1 for async, 2 for geo, 3 for sync
+  TrainingPhase training_phase;  // 1 for init, 2 for train
+  ValueType value_type;          // 1 for sparse, 2 for dense
+  void *keys;
+  void **sparse_values;  // for sparse values
+  Region *dense_values;  // for dense values
+  PushContext push_context;
+  size_t num;
+  bool is_training;
+  void *callback;
+};
+
 class PSClient {
  public:
   PSClient() {}
@@ -86,6 +122,9 @@ class PSClient {
   // 指定table数据load
   virtual std::future<int32_t> load(uint32_t table_id, const std::string &epoch,
                                     const std::string &mode) = 0;
+  // context配置load选项
+  virtual std::future<int32_t> Load(const LoadSaveContext &load_context) = 0;
+
   // 全量table数据save  value_accessor根据mode，可能有不同的save条件
   virtual std::future<int32_t> save(const std::string &epoch,
                                     const std::string &mode) = 0;
@@ -93,6 +132,8 @@ class PSClient {
   virtual std::future<int32_t> save(uint32_t table_id, const std::string &epoch,
                                     const std::string &mode) = 0;
 
+  virtual std::future<int32_t> Save(const LoadSaveContext &save_context) = 0;
+
   // 清空table数据
   virtual std::future<int32_t> clear() = 0;
   virtual std::future<int32_t> clear(uint32_t table_id) = 0;
@@ -107,6 +148,8 @@ class PSClient {
   virtual std::future<int32_t> pull_dense(Region *regions, size_t region_num,
                                           size_t table_id) = 0;  // 保留
 
+  virtual std::future<int32_t> Push(RequestContext &push_context) = 0;
+
   // firstly push dense param for parameter server
   // this is neccessary because dense weight initialized in trainer on cold
   // start
@@ -117,6 +160,9 @@ class PSClient {
   virtual std::future<int32_t> push_dense(const Region *regions,
                                           size_t region_num,
                                           size_t table_id) = 0;
+
+  virtual std::future<int32_t> Pull(RequestContext &pull_context) = 0;
+
   // 使用keys进行pull请求，结果填充values
   // keys和values的个数均为num个，每个value占用select_size空间
   // future结束前keys和values缓冲区不能再次使用
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc
index 972cce135f189..9e364b6d3ed7a 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc
@@ -56,6 +56,19 @@ ::std::future<int32_t> PsLocalClient::load(uint32_t table_id,
   return done();
 }
 
+std::future<int32_t> PsLocalClient::Load(const LoadSaveContext& load_context) {
+  if (load_context.table_id < 0) {
+    for (auto& it : _table_map) {
+      load(it.first, load_context.epoch, load_context.mode);
+    }
+    return done();
+  } else {
+    auto* table_ptr = table(load_context.table_id);
+    table_ptr->load(load_context.epoch, load_context.mode);
+    return done();
+  }
+}
+
 ::std::future<int32_t> PsLocalClient::save(const std::string& epoch,
                                            const std::string& mode) {
   // TODO
@@ -74,6 +87,21 @@ ::std::future<int32_t> PsLocalClient::save(uint32_t table_id,
   return done();
 }
 
+::std::future<int32_t> PsLocalClient::Save(
+    const LoadSaveContext& save_context) {
+  if (save_context.table_id < 0) {
+    for (auto& it : _table_map) {
+      save(it.first, save_context.epoch, save_context.mode);
+    }
+    return done();
+  } else {
+    auto* table_ptr = table(save_context.table_id);
+    table_ptr->flush();
+    table_ptr->save(save_context.epoch, save_context.mode);
+    return done();
+  }
+}
+
 ::std::future<int32_t> PsLocalClient::clear() {
   // TODO
   return done();
@@ -93,6 +121,51 @@ ::std::future<int32_t> PsLocalClient::stop_server() {
   return done();
 }
 
+::std::future<int32_t> PsLocalClient::Pull(RequestContext& pull_context) {
+  if (pull_context.value_type == Dense) {  // pull dense
+    Region* dense_region = reinterpret_cast<Region*>(pull_context.dense_values);
+    pull_dense(dense_region, pull_context.num, pull_context.table);
+  } else {  // pull sparse
+    uint64_t* keys = reinterpret_cast<uint64_t*>(pull_context.keys);
+    char** select_values = reinterpret_cast<char**>(pull_context.sparse_values);
+    size_t table_id = pull_context.table;
+    size_t num = pull_context.num;
+    pull_sparse_ptr(select_values, table_id, keys, num);
+  }
+}
+
+::std::future<int32_t> PsLocalClient::Push(RequestContext& push_context) {
+  if (push_context.value_type == Dense) {  // push dense
+    if (push_context.training_phase == Init) {
+      const Region* regions = push_context.push_context.push_dense_values;
+      size_t region_num = push_context.num;
+      push_dense_param(regions, region_num, push_context.table);
+    } else {
+      if (push_context.training_mode == Geo) {  // geo
+        float* total_send_data =
+            reinterpret_cast<float*>(push_context.dense_values);
+        size_t total_send_data_size = push_context.num;
+        push_dense_raw_gradient(push_context.table, total_send_data,
+                                total_send_data_size, push_context.callback);
+      } else {  // async and sync
+        const Region* regions = push_context.push_context.push_dense_values;
+        size_t region_num = push_context.num;
+        push_dense(regions, region_num, push_context.table);
+      }
+    }
+  } else {  // push sparse
+    if (push_context.training_mode == Async) {
+      const uint64_t* keys = push_context.push_context.keys;
+      const float** update_values = push_context.push_context.push_values;
+      size_t table_id = push_context.table;
+      size_t num = push_context.num;
+      push_sparse(table_id, keys, update_values, num);
+    } else {
+      // TODO
+    }
+  }
+}
+
 ::std::future<int32_t> PsLocalClient::pull_dense(Region* regions,
                                                  size_t region_num,
                                                  size_t table_id) {
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h
index e73974ac56286..83ca558e3d2cb 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.h
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.h
@@ -39,12 +39,16 @@ class PsLocalClient : public PSClient {
   virtual ::std::future<int32_t> load(uint32_t table_id,
                                       const std::string& epoch,
                                       const std::string& mode) override;
+  virtual std::future<int32_t> Load(
+      const LoadSaveContext& load_context) override;
 
   virtual ::std::future<int32_t> save(const std::string& epoch,
                                       const std::string& mode) override;
   virtual ::std::future<int32_t> save(uint32_t table_id,
                                       const std::string& epoch,
                                       const std::string& mode) override;
+  virtual std::future<int32_t> Save(
+      const LoadSaveContext& save_context) override;
 
   virtual ::std::future<int32_t> clear() override;
   virtual ::std::future<int32_t> clear(uint32_t table_id) override;
@@ -55,6 +59,10 @@ class PsLocalClient : public PSClient {
   virtual ::std::future<int32_t> pull_dense(Region* regions, size_t region_num,
                                             size_t table_id);
 
+  virtual ::std::future<int32_t> Pull(RequestContext& pull_context) override;
+
+  virtual ::std::future<int32_t> Push(RequestContext& push_context) override;
+
   virtual ::std::future<int32_t> push_dense(const Region* regions,
                                             size_t region_num, size_t table_id);
 
diff --git a/paddle/fluid/distributed/ps/service/ps_local_server.h b/paddle/fluid/distributed/ps/service/ps_local_server.h
index 91f8bc4c91271..31b52126fc576 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_server.h
+++ b/paddle/fluid/distributed/ps/service/ps_local_server.h
@@ -28,7 +28,6 @@ class PsLocalServer : public PSServer {
   virtual uint64_t start() { return 0; }
   virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; }
   virtual int32_t stop() { return 0; }
-  virtual int32_t port() { return 0; }
   virtual int32_t configure(
       const PSParameter &config, PSEnvironment &env, size_t server_rank,
       const std::vector<framework::ProgramDesc> &server_sub_program = {}) {
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
index 088edcb75bbc6..c8be0f7971090 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
@@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name,
   }
 }
 
-void add_graph_node(std::vector<uint64_t> node_ids,
+void add_graph_node(std::vector<int64_t> node_ids,
                     std::vector<bool> weight_list) {}
-void remove_graph_node(std::vector<uint64_t> node_ids) {}
+void remove_graph_node(std::vector<int64_t> node_ids) {}
 void GraphPyService::set_up(std::string ips_str, int shard_num,
                             std::vector<std::string> node_types,
                             std::vector<std::string> edge_types) {
@@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) {
 }
 
 void GraphPyClient::add_graph_node(std::string name,
-                                   std::vector<uint64_t>& node_ids,
+                                   std::vector<int64_t>& node_ids,
                                    std::vector<bool>& weight_list) {
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
@@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name,
 }
 
 void GraphPyClient::remove_graph_node(std::string name,
-                                      std::vector<uint64_t>& node_ids) {
+                                      std::vector<int64_t>& node_ids) {
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
     auto status = get_ps_client()->remove_graph_node(table_id, node_ids);
@@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) {
   }
 }
 
-std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
+std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
 GraphPyClient::batch_sample_neighbors(std::string name,
-                                      std::vector<uint64_t> node_ids,
+                                      std::vector<int64_t> node_ids,
                                       int sample_size, bool return_weight,
                                       bool return_edges) {
-  // std::vector<std::vector<std::pair<uint64_t, float>>> v;
-  std::vector<std::vector<uint64_t>> v;
+  std::vector<std::vector<int64_t>> v;
   std::vector<std::vector<float>> v1;
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
@@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name,
   // res.first[1]: slice index
   // res.first[2]: src nodes
   // res.second: edges weight
-  std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res;
+  std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res;
   res.first.push_back({});
   res.first.push_back({});
   if (return_edges) res.first.push_back({});
@@ -342,10 +341,10 @@ void GraphPyClient::use_neighbors_sample_cache(std::string name,
     status.wait();
   }
 }
-std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
-                                                         int server_index,
-                                                         int sample_size) {
-  std::vector<uint64_t> v;
+std::vector<int64_t> GraphPyClient::random_sample_nodes(std::string name,
+                                                        int server_index,
+                                                        int sample_size) {
+  std::vector<int64_t> v;
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
     auto status =
@@ -357,7 +356,7 @@ std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
 
 // (name, dtype, ndarray)
 std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
-    std::string node_type, std::vector<uint64_t> node_ids,
+    std::string node_type, std::vector<int64_t> node_ids,
     std::vector<std::string> feature_names) {
   std::vector<std::vector<std::string>> v(
       feature_names.size(), std::vector<std::string>(node_ids.size()));
@@ -371,7 +370,7 @@ std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
 }
 
 void GraphPyClient::set_node_feat(
-    std::string node_type, std::vector<uint64_t> node_ids,
+    std::string node_type, std::vector<int64_t> node_ids,
     std::vector<std::string> feature_names,
     const std::vector<std::vector<std::string>> features) {
   if (this->table_id_map.count(node_type)) {
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index c25ef5035453d..85707137c1800 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -70,18 +70,34 @@ class GraphPyService {
     ::paddle::distributed::TableAccessorParameter* accessor_proto =
         sparse_table_proto->mutable_accessor();
 
-    ::paddle::distributed::CommonAccessorParameter* common_proto =
-        sparse_table_proto->mutable_common();
+    // ::paddle::distributed::CommonAccessorParameter* common_proto =
+    //     sparse_table_proto->mutable_common();
 
+    ::paddle::distributed::GraphParameter* graph_proto =
+        sparse_table_proto->mutable_graph_parameter();
+
+    ::paddle::distributed::GraphFeature* graph_feature =
+        graph_proto->mutable_graph_feature();
+
+    graph_proto->set_task_pool_size(24);
+
+    graph_proto->set_table_name(table_name);
+    graph_proto->set_table_type(table_type);
+    graph_proto->set_use_cache(false);
     // Set GraphTable Parameter
-    common_proto->set_table_name(table_name);
-    common_proto->set_name(table_type);
+    // common_proto->set_table_name(table_name);
+    // common_proto->set_name(table_type);
+    // for (size_t i = 0; i < feat_name.size(); i++) {
+    //   common_proto->add_params(feat_dtype[i]);
+    //   common_proto->add_dims(feat_shape[i]);
+    //   common_proto->add_attributes(feat_name[i]);
+    // }
+
     for (size_t i = 0; i < feat_name.size(); i++) {
-      common_proto->add_params(feat_dtype[i]);
-      common_proto->add_dims(feat_shape[i]);
-      common_proto->add_attributes(feat_name[i]);
+      graph_feature->add_dtype(feat_dtype[i]);
+      graph_feature->add_shape(feat_shape[i]);
+      graph_feature->add_name(feat_name[i]);
     }
-
     accessor_proto->set_accessor_class("CommMergeAccessor");
   }
 
@@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService {
   void load_edge_file(std::string name, std::string filepath, bool reverse);
   void load_node_file(std::string name, std::string filepath);
   void clear_nodes(std::string name);
-  void add_graph_node(std::string name, std::vector<uint64_t>& node_ids,
+  void add_graph_node(std::string name, std::vector<int64_t>& node_ids,
                       std::vector<bool>& weight_list);
-  void remove_graph_node(std::string name, std::vector<uint64_t>& node_ids);
+  void remove_graph_node(std::string name, std::vector<int64_t>& node_ids);
   int get_client_id() { return client_id; }
   void set_client_id(int client_id) { this->client_id = client_id; }
   void start_client();
-  std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>>
-  batch_sample_neighbors(std::string name, std::vector<uint64_t> node_ids,
+  std::pair<std::vector<std::vector<int64_t>>, std::vector<float>>
+  batch_sample_neighbors(std::string name, std::vector<int64_t> node_ids,
                          int sample_size, bool return_weight,
                          bool return_edges);
-  std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
-                                            int sample_size);
+  std::vector<int64_t> random_sample_nodes(std::string name, int server_index,
+                                           int sample_size);
   std::vector<std::vector<std::string>> get_node_feat(
-      std::string node_type, std::vector<uint64_t> node_ids,
+      std::string node_type, std::vector<int64_t> node_ids,
       std::vector<std::string> feature_names);
   void use_neighbors_sample_cache(std::string name, size_t total_size_limit,
                                   size_t ttl);
-  void set_node_feat(std::string node_type, std::vector<uint64_t> node_ids,
+  void set_node_feat(std::string node_type, std::vector<int64_t> node_ids,
                      std::vector<std::string> feature_names,
                      const std::vector<std::vector<std::string>> features);
   std::vector<FeatureNode> pull_graph_list(std::string name, int server_index,
diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc
index 5f1974e3e610c..893f671359e40 100644
--- a/paddle/fluid/distributed/ps/service/server.cc
+++ b/paddle/fluid/distributed/ps/service/server.cc
@@ -67,8 +67,6 @@ int32_t PSServer::configure(
   _config = config.server_param();
   _rank = server_rank;
   _environment = &env;
-  _shuffled_ins =
-      paddle::framework::MakeChannel<std::pair<uint64_t, std::string>>();
   size_t shard_num = env.get_ps_servers().size();
 
   const auto &downpour_param = _config.downpour_server_param();
diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h
index 160d4a6128295..d2804405b4198 100644
--- a/paddle/fluid/distributed/ps/service/server.h
+++ b/paddle/fluid/distributed/ps/service/server.h
@@ -69,11 +69,6 @@ class PSServer {
       const PSParameter &config, PSEnvironment &env, size_t server_rank,
       const std::vector<framework::ProgramDesc> &server_sub_program = {});
 
-  // return server_ip
-  virtual std::string ip() { return butil::my_ip_cstr(); }
-  // return server_port
-  virtual int32_t port() = 0;
-
   virtual uint64_t start(const std::string &ip, uint32_t port) = 0;
   virtual int32_t stop() = 0;
 
@@ -94,15 +89,6 @@ class PSServer {
     return &_table_map;
   }
 
-  typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc;
-  virtual int registe_pserver2pserver_msg_handler(int msg_type,
-                                                  MsgHandlerFunc handler) {
-    _msg_handler_map[msg_type] = handler;
-    return 0;
-  }
-
-  paddle::framework::Channel<std::pair<uint64_t, std::string>> _shuffled_ins;
-
  protected:
   virtual int32_t initialize() = 0;
 
@@ -111,7 +97,6 @@ class PSServer {
   ServerParameter _config;
   PSEnvironment *_environment;
   std::unordered_map<uint32_t, std::shared_ptr<Table>> _table_map;
-  std::unordered_map<int32_t, MsgHandlerFunc> _msg_handler_map;
 
  protected:
   std::shared_ptr<framework::Scope> scope_;
diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt
index be916bf2e8003..2fa5ecb4051c5 100644
--- a/paddle/fluid/distributed/ps/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro
 
 set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table)
-
 cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
 
 target_link_libraries(table -fopenmp)
diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h
index 7c91a60864980..07c211bb9c128 100644
--- a/paddle/fluid/distributed/ps/table/accessor.h
+++ b/paddle/fluid/distributed/ps/table/accessor.h
@@ -45,6 +45,17 @@ struct DataConverter {
   std::string deconverter;
 };
 
+struct AccessorInfo {
+  size_t dim;
+  size_t size;
+  size_t select_size;
+  size_t select_dim;
+  size_t update_size;
+  size_t update_dim;
+  size_t mf_size;
+  size_t fea_dim;
+};
+
 class ValueAccessor {
  public:
   ValueAccessor() {}
@@ -68,6 +79,8 @@ class ValueAccessor {
   }
   virtual int initialize() = 0;
 
+  virtual void GetTableInfo(AccessorInfo& info) = 0;
+
   // value维度
   virtual size_t dim() = 0;
   // value各个维度的size
@@ -163,6 +176,7 @@ class ValueAccessor {
   TableAccessorParameter _config;
   std::unordered_map<int, std::shared_ptr<struct DataConverter>>
       _data_coverter_map;
+  AccessorInfo _accessor_info;
 };
 REGISTER_PSCORE_REGISTERER(ValueAccessor);
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.cc b/paddle/fluid/distributed/ps/table/common_dense_table.cc
index 607469e2f7b0d..cc0f5867a3d65 100644
--- a/paddle/fluid/distributed/ps/table/common_dense_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_dense_table.cc
@@ -128,6 +128,21 @@ int32_t CommonDenseTable::set_global_lr(float* lr) {
   return 0;
 }
 
+int32_t CommonDenseTable::Pull(TableContext& context) {
+  CHECK(context.value_type == Dense);
+  float* pull_values = context.pull_context.values;
+  return pull_dense(pull_values, context.num);
+}
+
+int32_t CommonDenseTable::Push(TableContext& context) {
+  CHECK(context.value_type == Dense);
+  if (context.pull_context.values != nullptr) {
+    const float* values = context.push_context.values;
+    return push_dense(values, context.num);
+  }
+  return 0;
+}
+
 int32_t CommonDenseTable::pull_dense(float* pull_values, size_t num) {
   std::copy(values_[param_idx_].begin(), values_[param_idx_].end(),
             pull_values);
diff --git a/paddle/fluid/distributed/ps/table/common_dense_table.h b/paddle/fluid/distributed/ps/table/common_dense_table.h
index a4c0f29ddb877..cad49a0a449c4 100644
--- a/paddle/fluid/distributed/ps/table/common_dense_table.h
+++ b/paddle/fluid/distributed/ps/table/common_dense_table.h
@@ -40,6 +40,8 @@ class CommonDenseTable : public DenseTable {
                                   const std::string& name);
   virtual int32_t initialize_value();
   virtual int32_t initialize_optimizer();
+  virtual int32_t Pull(TableContext& context);
+  virtual int32_t Push(TableContext& context);
   int32_t pull_dense(float* pull_values, size_t num) override;
   int32_t push_dense_param(const float* values, size_t num) override;
   int32_t push_dense(const float* values, size_t num) override;
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc
index 54b98cb96ce51..2c07bd65d63d4 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -27,6 +27,288 @@
 namespace paddle {
 namespace distributed {
 
+#ifdef PADDLE_WITH_HETERPS
+
+int CompleteGraphSampler::run_graph_sampling() {
+  pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
+  pthread_rwlock_rdlock(rw_lock);
+  std::cout << "in graph sampling" << std::endl;
+  sample_nodes.clear();
+  sample_neighbors.clear();
+  sample_res.clear();
+  sample_nodes.resize(gpu_num);
+  sample_neighbors.resize(gpu_num);
+  sample_res.resize(gpu_num);
+  std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
+      sample_nodes_ex(graph_table->task_pool_size_);
+  std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
+      graph_table->task_pool_size_);
+  for (int i = 0; i < graph_table->task_pool_size_; i++) {
+    sample_nodes_ex[i].resize(gpu_num);
+    sample_neighbors_ex[i].resize(gpu_num);
+  }
+  std::vector<std::future<int>> tasks;
+  for (size_t i = 0; i < graph_table->shards.size(); ++i) {
+    tasks.push_back(
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue([&, i, this]() -> int {
+              if (this->status == GraphSamplerStatus::terminating) return 0;
+              paddle::framework::GpuPsGraphNode node;
+              std::vector<Node *> &v =
+                  this->graph_table->shards[i]->get_bucket();
+              size_t ind = i % this->graph_table->task_pool_size_;
+              for (size_t j = 0; j < v.size(); j++) {
+                size_t location = v[j]->get_id() % this->gpu_num;
+                node.node_id = v[j]->get_id();
+                node.neighbor_size = v[j]->get_neighbor_size();
+                node.neighbor_offset =
+                    (int)sample_neighbors_ex[ind][location].size();
+                sample_nodes_ex[ind][location].emplace_back(node);
+                for (int k = 0; k < node.neighbor_size; k++)
+                  sample_neighbors_ex[ind][location].push_back(
+                      v[j]->get_neighbor_id(k));
+              }
+              return 0;
+            }));
+  }
+  for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+  tasks.clear();
+  for (size_t i = 0; i < gpu_num; i++) {
+    tasks.push_back(
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue([&, i, this]() -> int {
+              if (this->status == GraphSamplerStatus::terminating) return 0;
+              int total_offset = 0;
+              size_t ind = i % this->graph_table->task_pool_size_;
+              for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
+                for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
+                  sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]);
+                  sample_nodes[ind].back().neighbor_offset += total_offset;
+                }
+                size_t neighbor_size = sample_neighbors_ex[j][ind].size();
+                total_offset += neighbor_size;
+                for (size_t k = 0; k < neighbor_size; k++) {
+                  sample_neighbors[ind].push_back(
+                      sample_neighbors_ex[j][ind][k]);
+                }
+              }
+              return 0;
+            }));
+  }
+  for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+
+  if (this->status == GraphSamplerStatus::terminating) {
+    pthread_rwlock_unlock(rw_lock);
+    return 0;
+  }
+  for (size_t i = 0; i < gpu_num; i++) {
+    sample_res[i].node_list = sample_nodes[i].data();
+    sample_res[i].neighbor_list = sample_neighbors[i].data();
+    sample_res[i].node_size = sample_nodes[i].size();
+    sample_res[i].neighbor_size = sample_neighbors[i].size();
+  }
+  pthread_rwlock_unlock(rw_lock);
+  if (this->status == GraphSamplerStatus::terminating) {
+    return 0;
+  }
+  callback(sample_res);
+  return 0;
+}
+void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
+                                std::vector<std::string> args) {
+  this->gpu_num = gpu_num;
+  this->graph_table = graph_table;
+}
+
+int BasicBfsGraphSampler::run_graph_sampling() {
+  pthread_rwlock_t *rw_lock = graph_table->rw_lock.get();
+  pthread_rwlock_rdlock(rw_lock);
+  while (rounds > 0 && status == GraphSamplerStatus::running) {
+    for (size_t i = 0; i < sample_neighbors_map.size(); i++) {
+      sample_neighbors_map[i].clear();
+    }
+    sample_neighbors_map.clear();
+    std::vector<int> nodes_left(graph_table->shards.size(),
+                                node_num_for_each_shard);
+    std::promise<int> prom;
+    std::future<int> fut = prom.get_future();
+    sample_neighbors_map.resize(graph_table->task_pool_size_);
+    int task_size = 0;
+    std::vector<std::future<int>> tasks;
+    int init_size = 0;
+    //__sync_fetch_and_add
+    std::function<int(int, int64_t)> bfs = [&, this](int i, int id) -> int {
+      VLOG(0) << "in bfs " << i << " " << id;
+      if (this->status == GraphSamplerStatus::terminating) {
+        int task_left = __sync_sub_and_fetch(&task_size, 1);
+        if (task_left == 0) {
+          prom.set_value(0);
+        }
+        return 0;
+      }
+      size_t ind = i % this->graph_table->task_pool_size_;
+      if (nodes_left[i] > 0) {
+        nodes_left[i]--;
+        auto iter = sample_neighbors_map[ind].find(id);
+        if (iter == sample_neighbors_map[ind].end()) {
+          sample_neighbors_map[ind][id] = std::vector<int64_t>();
+          iter = sample_neighbors_map[ind].find(id);
+          Node *node = graph_table->shards[i]->find_node(id);
+          if (node != NULL) {
+            size_t edge_fetch_size =
+                std::min((size_t) this->edge_num_for_each_node,
+                         node->get_neighbor_size());
+            for (size_t k = 0; k < edge_fetch_size; k++) {
+              int64_t neighbor_id = node->get_neighbor_id(k);
+              int node_location = neighbor_id % this->graph_table->shard_num %
+                                  this->graph_table->task_pool_size_;
+              __sync_add_and_fetch(&task_size, 1);
+              graph_table->_shards_task_pool[node_location]->enqueue(
+                  bfs, neighbor_id % this->graph_table->shard_num, neighbor_id);
+              iter->second.push_back(neighbor_id);
+            }
+          }
+        }
+      }
+      int task_left = __sync_sub_and_fetch(&task_size, 1);
+      if (task_left == 0) {
+        prom.set_value(0);
+      }
+      return 0;
+    };
+    for (size_t i = 0; i < graph_table->shards.size(); ++i) {
+      std::vector<Node *> &v = graph_table->shards[i]->get_bucket();
+      if (v.size() > 0) {
+        init_size++;
+        __sync_add_and_fetch(&task_size, 1);
+        int64_t id = v[0]->get_id();
+        graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+            ->enqueue(bfs, i, id);
+      }  // if
+    }
+    if (init_size == 0) {
+      prom.set_value(0);
+    }
+    fut.get();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    std::cout << "bfs over" << std::endl;
+    sample_nodes.clear();
+    sample_neighbors.clear();
+    sample_res.clear();
+    sample_nodes.resize(gpu_num);
+    sample_neighbors.resize(gpu_num);
+    sample_res.resize(gpu_num);
+    std::vector<std::vector<std::vector<paddle::framework::GpuPsGraphNode>>>
+        sample_nodes_ex(graph_table->task_pool_size_);
+    std::vector<std::vector<std::vector<int64_t>>> sample_neighbors_ex(
+        graph_table->task_pool_size_);
+    for (int i = 0; i < graph_table->task_pool_size_; i++) {
+      sample_nodes_ex[i].resize(gpu_num);
+      sample_neighbors_ex[i].resize(gpu_num);
+    }
+    tasks.clear();
+    for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) {
+      tasks.push_back(
+          graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int {
+            if (this->status == GraphSamplerStatus::terminating) {
+              return 0;
+            }
+            paddle::framework::GpuPsGraphNode node;
+            auto iter = sample_neighbors_map[i].begin();
+            size_t ind = i;
+            for (; iter != sample_neighbors_map[i].end(); iter++) {
+              size_t location = iter->first % this->gpu_num;
+              node.node_id = iter->first;
+              node.neighbor_size = iter->second.size();
+              node.neighbor_offset =
+                  (int)sample_neighbors_ex[ind][location].size();
+              sample_nodes_ex[ind][location].emplace_back(node);
+              for (auto k : iter->second)
+                sample_neighbors_ex[ind][location].push_back(k);
+            }
+            return 0;
+          }));
+    }
+
+    for (size_t i = 0; i < tasks.size(); i++) {
+      tasks[i].get();
+      sample_neighbors_map[i].clear();
+    }
+    tasks.clear();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    for (size_t i = 0; i < gpu_num; i++) {
+      tasks.push_back(
+          graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
+              ->enqueue([&, i, this]() -> int {
+                if (this->status == GraphSamplerStatus::terminating) {
+                  pthread_rwlock_unlock(rw_lock);
+                  return 0;
+                }
+                int total_offset = 0;
+                size_t ind = i % graph_table->task_pool_size_;
+                for (int j = 0; j < this->graph_table->task_pool_size_; j++) {
+                  for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) {
+                    sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]);
+                    sample_nodes[i].back().neighbor_offset += total_offset;
+                    // neighbor_offset[i].push_back(total_offset +
+                    // neighbor_offset_ex[j][i][k]);
+                  }
+                  size_t neighbor_size = sample_neighbors_ex[j][ind].size();
+                  total_offset += neighbor_size;
+                  for (size_t k = 0; k < neighbor_size; k++) {
+                    sample_neighbors[ind].push_back(
+                        sample_neighbors_ex[j][ind][k]);
+                  }
+                }
+                return 0;
+              }));
+    }
+    for (size_t i = 0; i < tasks.size(); i++) tasks[i].get();
+    if (this->status == GraphSamplerStatus::terminating) {
+      pthread_rwlock_unlock(rw_lock);
+      return 0;
+    }
+    // int64_t total_neighbors =
+    // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0);
+    for (size_t i = 0; i < gpu_num; i++) {
+      sample_res[i].node_list = sample_nodes[i].data();
+      sample_res[i].neighbor_list = sample_neighbors[i].data();
+      sample_res[i].node_size = sample_nodes[i].size();
+      sample_res[i].neighbor_size = sample_neighbors[i].size();
+    }
+    pthread_rwlock_unlock(rw_lock);
+    if (this->status == GraphSamplerStatus::terminating) {
+      return 0;
+    }
+    callback(sample_res);
+    rounds--;
+    if (rounds > 0) {
+      for (int i = 0;
+           i < interval && this->status == GraphSamplerStatus::running; i++) {
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+      }
+    }
+  }
+  return 0;
+}
+void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table,
+                                std::vector<std::string> args) {
+  this->gpu_num = gpu_num;
+  this->graph_table = graph_table;
+  node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10;
+  edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10;
+  rounds = args.size() > 2 ? std::stoi(args[2]) : 1;
+  interval = args.size() > 3 ? std::stoi(args[3]) : 60;
+}
+
+#endif
+
 std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
   if (start < 0) start = 0;
   std::vector<Node *> res;
@@ -38,10 +320,10 @@ std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
 
 size_t GraphShard::get_size() { return bucket.size(); }
 
-int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
+int32_t GraphTable::add_graph_node(std::vector<int64_t> &id_list,
                                    std::vector<bool> &is_weight_list) {
   size_t node_size = id_list.size();
-  std::vector<std::vector<std::pair<uint64_t, bool>>> batch(task_pool_size_);
+  std::vector<std::vector<std::pair<int64_t, bool>>> batch(task_pool_size_);
   for (size_t i = 0; i < node_size; i++) {
     size_t shard_id = id_list[i] % shard_num;
     if (shard_id >= shard_end || shard_id < shard_start) {
@@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector<uint64_t> &id_list,
   return 0;
 }
 
-int32_t GraphTable::remove_graph_node(std::vector<uint64_t> &id_list) {
+int32_t GraphTable::remove_graph_node(std::vector<int64_t> &id_list) {
   size_t node_size = id_list.size();
-  std::vector<std::vector<uint64_t>> batch(task_pool_size_);
+  std::vector<std::vector<int64_t>> batch(task_pool_size_);
   for (size_t i = 0; i < node_size; i++) {
     size_t shard_id = id_list[i] % shard_num;
     if (shard_id >= shard_end || shard_id < shard_start) continue;
@@ -98,7 +380,7 @@ void GraphShard::clear() {
 
 GraphShard::~GraphShard() { clear(); }
 
-void GraphShard::delete_node(uint64_t id) {
+void GraphShard::delete_node(int64_t id) {
   auto iter = node_location.find(id);
   if (iter == node_location.end()) return;
   int pos = iter->second;
@@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) {
   node_location.erase(id);
   bucket.pop_back();
 }
-GraphNode *GraphShard::add_graph_node(uint64_t id) {
+GraphNode *GraphShard::add_graph_node(int64_t id) {
   if (node_location.find(id) == node_location.end()) {
     node_location[id] = bucket.size();
     bucket.push_back(new GraphNode(id));
@@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) {
   }
   return (GraphNode *)bucket[node_location[id]];
 }
-FeatureNode *GraphShard::add_feature_node(uint64_t id) {
+FeatureNode *GraphShard::add_feature_node(int64_t id) {
   if (node_location.find(id) == node_location.end()) {
     node_location[id] = bucket.size();
     bucket.push_back(new FeatureNode(id));
@@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) {
   return (FeatureNode *)bucket[node_location[id]];
 }
 
-void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) {
+void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) {
   find_node(id)->add_edge(dst_id, weight);
 }
 
-Node *GraphShard::find_node(uint64_t id) {
+Node *GraphShard::find_node(int64_t id) {
   auto iter = node_location.find(id);
   return iter == node_location.end() ? nullptr : bucket[iter->second];
 }
@@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string &param) {
 }
 
 int32_t GraphTable::get_nodes_ids_by_ranges(
-    std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res) {
+    std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res) {
   int start = 0, end, index = 0, total_size = 0;
   res.clear();
-  std::vector<std::future<std::vector<uint64_t>>> tasks;
+  std::vector<std::future<std::vector<int64_t>>> tasks;
   for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
     end = total_size + shards[i]->get_size();
     start = total_size;
-    while (start < end && index < ranges.size()) {
+    while (start < end && index < (int)ranges.size()) {
       if (ranges[index].second <= start)
         index++;
       else if (ranges[index].first >= end) {
@@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges(
         first -= total_size;
         second -= total_size;
         tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
-            [this, first, second, i]() -> std::vector<uint64_t> {
+            [this, first, second, i]() -> std::vector<int64_t> {
               return shards[i]->get_ids_by_range(first, second);
             }));
       }
@@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
 }
 
 int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
+#ifdef PADDLE_WITH_HETERPS
+  if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get());
+#endif
   auto paths = paddle::string::split_string<std::string>(path, ";");
   int64_t count = 0;
   std::string sample_type = "random";
@@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
   /*-----------------------
   relocate the duplicate nodes to make them distributed evenly among threads.
 */
+  if (!use_duplicate_nodes) {
+#ifdef PADDLE_WITH_HETERPS
+    if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
+#endif
+
+    return 0;
+  }
   for (auto &shard : extra_shards) {
     auto bucket = shard->get_bucket();
     for (size_t i = 0; i < bucket.size(); i++) {
@@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
   int size = extra_nodes_to_thread_index.size();
   if (size == 0) return 0;
   std::vector<int> index;
-  for (int i = 0; i < used.size(); i++) index.push_back(i);
+  for (int i = 0; i < (int)used.size(); i++) index.push_back(i);
   sort(index.begin(), index.end(),
        [&](int &a, int &b) { return used[a] < used[b]; });
 
   std::vector<int> alloc(index.size(), 0), has_alloc(index.size(), 0);
   int t = 1, aim = 0, mod = 0;
-  for (; t < used.size(); t++) {
+  for (; t < (int)used.size(); t++) {
     if ((used[index[t]] - used[index[t - 1]]) * t >= size) {
       break;
     } else {
@@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
     if (t - x <= mod) alloc[index[x]]++;
     alloc[index[x]] -= used[index[x]];
   }
-  std::vector<uint64_t> vec[index.size()];
+  std::vector<int64_t> vec[index.size()];
   for (auto p : extra_nodes_to_thread_index) {
     has_alloc[p.second]++;
     vec[p.second].push_back(p.first);
@@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
                      has_alloc[index[right]] - alloc[index[right]]);
     has_alloc[index[left]] += x;
     has_alloc[index[right]] -= x;
-    uint64_t id;
+    int64_t id;
     while (x--) {
       id = vec[index[right]].back();
       vec[index[right]].pop_back();
@@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
     delete extra_shards[i];
     extra_shards[i] = extra_shards_copy[i];
   }
+#ifdef PADDLE_WITH_HETERPS
+  if (gpups_mode) pthread_rwlock_unlock(rw_lock.get());
+#endif
   return 0;
 }
 
-Node *GraphTable::find_node(uint64_t id) {
+Node *GraphTable::find_node(int64_t id) {
   size_t shard_id = id % shard_num;
   if (shard_id >= shard_end || shard_id < shard_start) {
     if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
@@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) {
   Node *node = shards[index]->find_node(id);
   return node;
 }
-uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
+uint32_t GraphTable::get_thread_pool_index(int64_t node_id) {
   if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0)
     return node_id % shard_num % shard_num_per_server % task_pool_size_;
   size_t src_shard_id = node_id % shard_num;
@@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
   return src_shard_id % shard_num_per_server % task_pool_size_;
 }
 
-uint32_t GraphTable::get_thread_pool_index_by_shard_index(
-    uint64_t shard_index) {
+uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) {
   return shard_index % shard_num_per_server % task_pool_size_;
 }
 
@@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
                                         std::unique_ptr<char[]> &buffer,
                                         int &actual_size) {
   int total_size = 0;
-  for (int i = 0; i < shards.size(); i++) {
+  for (int i = 0; i < (int)shards.size(); i++) {
     total_size += shards[i]->get_size();
   }
   if (sample_size > total_size) sample_size = total_size;
@@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
     }
   }
   for (auto &pair : first_half) second_half.push_back(pair);
-  std::vector<uint64_t> res;
+  std::vector<int64_t> res;
   get_nodes_ids_by_ranges(second_half, res);
-  actual_size = res.size() * sizeof(uint64_t);
+  actual_size = res.size() * sizeof(int64_t);
   buffer.reset(new char[actual_size]);
   char *pointer = buffer.get();
   memcpy(pointer, res.data(), actual_size);
   return 0;
 }
 int32_t GraphTable::random_sample_neighbors(
-    uint64_t *node_ids, int sample_size,
+    int64_t *node_ids, int sample_size,
     std::vector<std::shared_ptr<char>> &buffers, std::vector<int> &actual_sizes,
     bool need_weight) {
   size_t node_num = buffers.size();
@@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors(
     seq_id[index].emplace_back(idx);
     id_list[index].emplace_back(node_ids[idx], sample_size, need_weight);
   }
-  for (int i = 0; i < seq_id.size(); i++) {
+  for (int i = 0; i < (int)seq_id.size(); i++) {
     if (seq_id[i].size() == 0) continue;
     tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
-      uint64_t node_id;
+      int64_t node_id;
       std::vector<std::pair<SampleKey, SampleResult>> r;
       LRUResponse response = LRUResponse::blocked;
       if (use_cache) {
@@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors(
       std::vector<SampleKey> sample_keys;
       auto &rng = _shards_task_rng_pool[i];
       for (size_t k = 0; k < id_list[i].size(); k++) {
-        if (index < r.size() &&
+        if (index < (int)r.size() &&
             r[index].first.node_key == id_list[i][k].node_key) {
           idx = seq_id[i][k];
           actual_sizes[idx] = r[index].second.actual_size;
@@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors(
               res.size() * (need_weight ? (Node::id_size + Node::weight_size)
                                         : Node::id_size);
           int offset = 0;
-          uint64_t id;
+          int64_t id;
           float weight;
           char *buffer_addr = new char[actual_size];
           if (response == LRUResponse::ok) {
@@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors(
   return 0;
 }
 
-int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
+int32_t GraphTable::get_node_feat(const std::vector<int64_t> &node_ids,
                                   const std::vector<std::string> &feature_names,
                                   std::vector<std::vector<std::string>> &res) {
   size_t node_num = node_ids.size();
   std::vector<std::future<int>> tasks;
   for (size_t idx = 0; idx < node_num; ++idx) {
-    uint64_t node_id = node_ids[idx];
+    int64_t node_id = node_ids[idx];
     tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
         [&, idx, node_id]() -> int {
           Node *node = find_node(node_id);
@@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
           if (node == nullptr) {
             return 0;
           }
-          for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
+          for (int feat_idx = 0; feat_idx < (int)feature_names.size();
+               ++feat_idx) {
             const std::string &feature_name = feature_names[feat_idx];
             if (feat_id_map.find(feature_name) != feat_id_map.end()) {
               // res[feat_idx][idx] =
@@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
 }
 
 int32_t GraphTable::set_node_feat(
-    const std::vector<uint64_t> &node_ids,
+    const std::vector<int64_t> &node_ids,
     const std::vector<std::string> &feature_names,
     const std::vector<std::vector<std::string>> &res) {
   size_t node_num = node_ids.size();
   std::vector<std::future<int>> tasks;
   for (size_t idx = 0; idx < node_num; ++idx) {
-    uint64_t node_id = node_ids[idx];
+    int64_t node_id = node_ids[idx];
     tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
         [&, idx, node_id]() -> int {
           size_t index = node_id % this->shard_num - this->shard_start;
           auto node = shards[index]->add_feature_node(node_id);
           node->set_feature_size(this->feat_name.size());
-          for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
+          for (int feat_idx = 0; feat_idx < (int)feature_names.size();
+               ++feat_idx) {
             const std::string &feature_name = feature_names[feat_idx];
             if (feat_id_map.find(feature_name) != feat_id_map.end()) {
               node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]);
@@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size,
   return 0;
 }
 
-int32_t GraphTable::get_server_index_by_id(uint64_t id) {
+int32_t GraphTable::get_server_index_by_id(int64_t id) {
   return id % shard_num / shard_num_per_server;
 }
+int32_t GraphTable::initialize(const TableParameter &config,
+                               const FsClientParameter &fs_config) {
+  LOG(INFO) << "in graphTable initialize";
+  _config = config;
+  if (initialize_accessor() != 0) {
+    LOG(WARNING) << "Table accessor initialize failed";
+    return -1;
+  }
 
-int32_t GraphTable::initialize() {
+  if (_afs_client.initialize(fs_config) != 0) {
+    LOG(WARNING) << "Table fs_client initialize failed";
+    // return -1;
+  }
+  auto graph = config.graph_parameter();
+  shard_num = _config.shard_num();
+  LOG(INFO) << "in graphTable initialize over";
+  return initialize(graph);
+}
+int32_t GraphTable::initialize(const GraphParameter &graph) {
+#ifdef PADDLE_WITH_HETERPS
+  if (graph.gpups_mode()) {
+    gpups_mode = true;
+    if (shard_num == 0) {
+      shard_num = graph.gpups_mode_shard_num();
+      server_num = 1;
+      _shard_idx = 0;
+    }
+    auto *sampler =
+        CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class());
+    auto slices =
+        string::split_string<std::string>(graph.gpups_graph_sample_args(), ",");
+    std::cout << "slices" << std::endl;
+    for (auto x : slices) std::cout << x << std::endl;
+    sampler->init(graph.gpu_num(), this, slices);
+    graph_sampler.reset(sampler);
+  }
+#endif
+  task_pool_size_ = graph.task_pool_size();
   _shards_task_pool.resize(task_pool_size_);
   for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
     _shards_task_pool[i].reset(new ::ThreadPool(1));
     _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0));
   }
-  server_num = _shard_num;
-  // VLOG(0) << "in init graph table server num = " << server_num;
-  /*
-  _shard_num is actually server number here
-  when a server initialize its tables, it sets tables' _shard_num to server_num,
-  and _shard_idx to server
-  rank
-  */
-  auto common = _config.common();
-
-  this->table_name = common.table_name();
-  this->table_type = common.name();
+  auto graph_feature = graph.graph_feature();
+  // this->table_name = common.table_name();
+  // this->table_type = common.name();
+  this->table_name = graph.table_name();
+  this->table_type = graph.table_type();
   VLOG(0) << " init graph table type " << this->table_type << " table name "
           << this->table_name;
-  int feat_conf_size = static_cast<int>(common.attributes().size());
+  // int feat_conf_size = static_cast<int>(common.attributes().size());
+  int feat_conf_size = static_cast<int>(graph_feature.name().size());
   for (int i = 0; i < feat_conf_size; i++) {
-    auto &f_name = common.attributes()[i];
-    auto &f_shape = common.dims()[i];
-    auto &f_dtype = common.params()[i];
+    // auto &f_name = common.attributes()[i];
+    // auto &f_shape = common.dims()[i];
+    // auto &f_dtype = common.params()[i];
+    auto &f_name = graph_feature.name()[i];
+    auto &f_shape = graph_feature.shape()[i];
+    auto &f_dtype = graph_feature.dtype()[i];
     this->feat_name.push_back(f_name);
     this->feat_shape.push_back(f_shape);
     this->feat_dtype.push_back(f_dtype);
@@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() {
     VLOG(0) << "init graph table feat conf name:" << f_name
             << " shape:" << f_shape << " dtype:" << f_dtype;
   }
-
-  shard_num = _config.shard_num();
   VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx"
           << _shard_idx;
   shard_num_per_server = sparse_local_shard_num(shard_num, server_num);
@@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() {
 
   return 0;
 }
+
 }  // namespace distributed
 };  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h
index c76a62248c8fc..f6f127621b947 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.h
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.h
@@ -38,10 +38,14 @@
 #include <vector>
 #include "paddle/fluid/distributed/ps/table/accessor.h"
 #include "paddle/fluid/distributed/ps/table/common_table.h"
+#include "paddle/fluid/distributed/ps/table/graph/class_macro.h"
 #include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/core/utils/rw_lock.h"
 
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
+#endif
 namespace paddle {
 namespace distributed {
 class GraphShard {
@@ -51,37 +55,37 @@ class GraphShard {
   ~GraphShard();
   std::vector<Node *> &get_bucket() { return bucket; }
   std::vector<Node *> get_batch(int start, int end, int step);
-  std::vector<uint64_t> get_ids_by_range(int start, int end) {
-    std::vector<uint64_t> res;
+  std::vector<int64_t> get_ids_by_range(int start, int end) {
+    std::vector<int64_t> res;
     for (int i = start; i < end && i < (int)bucket.size(); i++) {
       res.push_back(bucket[i]->get_id());
     }
     return res;
   }
 
-  GraphNode *add_graph_node(uint64_t id);
+  GraphNode *add_graph_node(int64_t id);
   GraphNode *add_graph_node(Node *node);
-  FeatureNode *add_feature_node(uint64_t id);
-  Node *find_node(uint64_t id);
-  void delete_node(uint64_t id);
+  FeatureNode *add_feature_node(int64_t id);
+  Node *find_node(int64_t id);
+  void delete_node(int64_t id);
   void clear();
-  void add_neighbor(uint64_t id, uint64_t dst_id, float weight);
-  std::unordered_map<uint64_t, int> &get_node_location() {
+  void add_neighbor(int64_t id, int64_t dst_id, float weight);
+  std::unordered_map<int64_t, int> &get_node_location() {
     return node_location;
   }
 
  private:
-  std::unordered_map<uint64_t, int> node_location;
+  std::unordered_map<int64_t, int> node_location;
   std::vector<Node *> bucket;
 };
 
 enum LRUResponse { ok = 0, blocked = 1, err = 2 };
 
 struct SampleKey {
-  uint64_t node_key;
+  int64_t node_key;
   size_t sample_size;
   bool is_weighted;
-  SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted)
+  SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted)
       : node_key(_node_key),
         sample_size(_sample_size),
         is_weighted(_is_weighted) {}
@@ -300,7 +304,7 @@ class ScaledLRU {
       node_size += lru_pool[i].node_size - lru_pool[i].remove_count;
     }
 
-    if (node_size <= size_t(1.1 * size_limit) + 1) return 0;
+    if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0;
     if (pthread_rwlock_wrlock(&rwlock) == 0) {
       // VLOG(0)<"in shrink\n";
       global_count = 0;
@@ -308,9 +312,9 @@ class ScaledLRU {
         global_count += lru_pool[i].node_size - lru_pool[i].remove_count;
       }
       // VLOG(0)<<"global_count "<<global_count<<"\n";
-      if (global_count > size_limit) {
+      if ((size_t)global_count > size_limit) {
         size_t remove = global_count - size_limit;
-        for (int i = 0; i < lru_pool.size(); i++) {
+        for (size_t i = 0; i < lru_pool.size(); i++) {
           lru_pool[i].total_diff = 0;
           lru_pool[i].remove_count +=
               1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) /
@@ -352,9 +356,69 @@ class ScaledLRU {
   friend class RandomSampleLRU<K, V>;
 };
 
+#ifdef PADDLE_WITH_HETERPS
+enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 };
+class GraphTable;
+class GraphSampler {
+ public:
+  GraphSampler() {
+    status = GraphSamplerStatus::waiting;
+    thread_pool.reset(new ::ThreadPool(1));
+    callback = [](std::vector<paddle::framework::GpuPsCommGraph> &res) {
+      return;
+    };
+  }
+  virtual int run_graph_sampling() = 0;
+  virtual int start_graph_sampling() {
+    if (status != GraphSamplerStatus::waiting) {
+      return -1;
+    }
+    std::promise<int> prom;
+    std::future<int> fut = prom.get_future();
+    graph_sample_task_over = thread_pool->enqueue([&prom, this]() {
+      prom.set_value(0);
+      status = GraphSamplerStatus::running;
+      return run_graph_sampling();
+    });
+    return fut.get();
+  }
+  virtual void init(size_t gpu_num, GraphTable *graph_table,
+                    std::vector<std::string> args) = 0;
+  virtual void set_graph_sample_callback(
+      std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
+          callback) {
+    this->callback = callback;
+  }
+
+  virtual int end_graph_sampling() {
+    if (status == GraphSamplerStatus::running) {
+      status = GraphSamplerStatus::terminating;
+      return graph_sample_task_over.get();
+    }
+    return -1;
+  }
+  virtual GraphSamplerStatus get_graph_sampler_status() { return status; }
+
+ protected:
+  std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
+      callback;
+  std::shared_ptr<::ThreadPool> thread_pool;
+  GraphSamplerStatus status;
+  std::future<int> graph_sample_task_over;
+  std::vector<paddle::framework::GpuPsCommGraph> sample_res;
+};
+#endif
+
 class GraphTable : public SparseTable {
  public:
-  GraphTable() { use_cache = false; }
+  GraphTable() {
+    use_cache = false;
+    shard_num = 0;
+#ifdef PADDLE_WITH_HETERPS
+    gpups_mode = false;
+#endif
+    rw_lock.reset(new pthread_rwlock_t());
+  }
   virtual ~GraphTable();
   virtual int32_t pull_graph_list(int start, int size,
                                   std::unique_ptr<char[]> &buffer,
@@ -362,7 +426,7 @@ class GraphTable : public SparseTable {
                                   int step);
 
   virtual int32_t random_sample_neighbors(
-      uint64_t *node_ids, int sample_size,
+      int64_t *node_ids, int sample_size,
       std::vector<std::shared_ptr<char>> &buffers,
       std::vector<int> &actual_sizes, bool need_weight);
 
@@ -370,9 +434,11 @@ class GraphTable : public SparseTable {
                               int &actual_sizes);
 
   virtual int32_t get_nodes_ids_by_ranges(
-      std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res);
-  virtual int32_t initialize();
-
+      std::vector<std::pair<int, int>> ranges, std::vector<int64_t> &res);
+  virtual int32_t initialize() { return 0; }
+  virtual int32_t initialize(const TableParameter &config,
+                             const FsClientParameter &fs_config);
+  virtual int32_t initialize(const GraphParameter &config);
   int32_t load(const std::string &path, const std::string &param);
   int32_t load_graph_split_config(const std::string &path);
 
@@ -380,13 +446,16 @@ class GraphTable : public SparseTable {
 
   int32_t load_nodes(const std::string &path, std::string node_type);
 
-  int32_t add_graph_node(std::vector<uint64_t> &id_list,
+  int32_t add_graph_node(std::vector<int64_t> &id_list,
                          std::vector<bool> &is_weight_list);
 
-  int32_t remove_graph_node(std::vector<uint64_t> &id_list);
+  int32_t remove_graph_node(std::vector<int64_t> &id_list);
+
+  int32_t get_server_index_by_id(int64_t id);
+  Node *find_node(int64_t id);
 
-  int32_t get_server_index_by_id(uint64_t id);
-  Node *find_node(uint64_t id);
+  virtual int32_t Pull(TableContext &context) { return 0; }
+  virtual int32_t Push(TableContext &context) { return 0; }
 
   virtual int32_t pull_sparse(float *values,
                               const PullSparseValue &pull_value) {
@@ -407,16 +476,27 @@ class GraphTable : public SparseTable {
     return 0;
   }
   virtual int32_t initialize_shard() { return 0; }
-  virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index);
-  virtual uint32_t get_thread_pool_index(uint64_t node_id);
+  virtual int32_t set_shard(size_t shard_idx, size_t server_num) {
+    _shard_idx = shard_idx;
+    /*
+    _shard_num is not used in graph_table, this following operation is for the
+    purpose of
+    being compatible with base class table.
+    */
+    _shard_num = server_num;
+    this->server_num = server_num;
+    return 0;
+  }
+  virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index);
+  virtual uint32_t get_thread_pool_index(int64_t node_id);
   virtual std::pair<int32_t, std::string> parse_feature(std::string feat_str);
 
-  virtual int32_t get_node_feat(const std::vector<uint64_t> &node_ids,
+  virtual int32_t get_node_feat(const std::vector<int64_t> &node_ids,
                                 const std::vector<std::string> &feature_names,
                                 std::vector<std::vector<std::string>> &res);
 
   virtual int32_t set_node_feat(
-      const std::vector<uint64_t> &node_ids,
+      const std::vector<int64_t> &node_ids,
       const std::vector<std::string> &feature_names,
       const std::vector<std::vector<std::string>> &res);
 
@@ -433,11 +513,25 @@ class GraphTable : public SparseTable {
     }
     return 0;
   }
-
+#ifdef PADDLE_WITH_HETERPS
+  virtual int32_t start_graph_sampling() {
+    return this->graph_sampler->start_graph_sampling();
+  }
+  virtual int32_t end_graph_sampling() {
+    return this->graph_sampler->end_graph_sampling();
+  }
+  virtual int32_t set_graph_sample_callback(
+      std::function<void(std::vector<paddle::framework::GpuPsCommGraph> &)>
+          callback) {
+    graph_sampler->set_graph_sample_callback(callback);
+    return 0;
+  }
+// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); }
+#endif
  protected:
   std::vector<GraphShard *> shards, extra_shards;
   size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num;
-  const int task_pool_size_ = 24;
+  int task_pool_size_ = 24;
   const int random_sample_nodes_ranges = 3;
 
   std::vector<std::string> feat_name;
@@ -450,11 +544,61 @@ class GraphTable : public SparseTable {
   std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
   std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool;
   std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru;
-  std::unordered_set<uint64_t> extra_nodes;
-  std::unordered_map<uint64_t, size_t> extra_nodes_to_thread_index;
+  std::unordered_set<int64_t> extra_nodes;
+  std::unordered_map<int64_t, size_t> extra_nodes_to_thread_index;
   bool use_cache, use_duplicate_nodes;
   mutable std::mutex mutex_;
+  std::shared_ptr<pthread_rwlock_t> rw_lock;
+#ifdef PADDLE_WITH_HETERPS
+  // paddle::framework::GpuPsGraphTable gpu_graph_table;
+  bool gpups_mode;
+  // std::shared_ptr<::ThreadPool> graph_sample_pool;
+  std::shared_ptr<GraphSampler> graph_sampler;
+  REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler)
+#endif
+};
+
+#ifdef PADDLE_WITH_HETERPS
+REGISTER_PSCORE_REGISTERER(GraphSampler);
+class CompleteGraphSampler : public GraphSampler {
+ public:
+  CompleteGraphSampler() {}
+  ~CompleteGraphSampler() {}
+  // virtual pthread_rwlock_t *export_rw_lock();
+  virtual int run_graph_sampling();
+  virtual void init(size_t gpu_num, GraphTable *graph_table,
+                    std::vector<std::string> args_);
+
+ protected:
+  GraphTable *graph_table;
+  std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
+  std::vector<std::vector<int64_t>> sample_neighbors;
+  // std::vector<GpuPsCommGraph> sample_res;
+  // std::shared_ptr<std::mt19937_64> random;
+  int gpu_num;
+};
+
+class BasicBfsGraphSampler : public GraphSampler {
+ public:
+  BasicBfsGraphSampler() {}
+  ~BasicBfsGraphSampler() {}
+  // virtual pthread_rwlock_t *export_rw_lock();
+  virtual int run_graph_sampling();
+  virtual void init(size_t gpu_num, GraphTable *graph_table,
+                    std::vector<std::string> args_);
+
+ protected:
+  GraphTable *graph_table;
+  // std::vector<std::vector<GpuPsGraphNode>> sample_nodes;
+  std::vector<std::vector<paddle::framework::GpuPsGraphNode>> sample_nodes;
+  std::vector<std::vector<int64_t>> sample_neighbors;
+  size_t gpu_num;
+  int node_num_for_each_shard, edge_num_for_each_node;
+  int rounds, interval;
+  std::vector<std::unordered_map<int64_t, std::vector<int64_t>>>
+      sample_neighbors_map;
 };
+#endif
 }  // namespace distributed
 
 };  // namespace paddle
diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.cc b/paddle/fluid/distributed/ps/table/common_sparse_table.cc
index b44d08b937a96..45be53335e1a1 100644
--- a/paddle/fluid/distributed/ps/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_sparse_table.cc
@@ -355,6 +355,32 @@ int32_t CommonSparseTable::pour() {
   return 0;
 }
 
+int32_t CommonSparseTable::Pull(TableContext& context) {
+  CHECK(context.value_type == Sparse);
+  if (context.use_ptr) {
+    char** pull_values = context.pull_context.ptr_values;
+    const uint64_t* keys = context.pull_context.keys;
+    return pull_sparse_ptr(pull_values, keys, context.num);
+  } else {
+    float* pull_values = context.pull_context.values;
+    const PullSparseValue& pull_value = context.pull_context.pull_value;
+    return pull_sparse(pull_values, pull_value);
+  }
+}
+
+int32_t CommonSparseTable::Push(TableContext& context) {
+  CHECK(context.value_type == Sparse);
+  if (context.pull_context.values != nullptr) {
+    const float* values = context.push_context.values;
+    const uint64_t* keys = context.push_context.keys;
+    return push_sparse(keys, values, context.num);
+  } else {
+    const float** values = context.push_context.ptr_values;
+    const uint64_t* keys = context.push_context.keys;
+    return push_sparse(keys, values, context.num);
+  }
+}
+
 int32_t CommonSparseTable::pull_sparse(float* pull_values,
                                        const PullSparseValue& pull_value) {
   auto shard_num = task_pool_size_;
diff --git a/paddle/fluid/distributed/ps/table/common_sparse_table.h b/paddle/fluid/distributed/ps/table/common_sparse_table.h
index 82481dcd584e4..138c544742066 100644
--- a/paddle/fluid/distributed/ps/table/common_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/common_sparse_table.h
@@ -121,6 +121,9 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t push_dense(const float* values, size_t num) { return 0; }
   // unused method end
 
+  virtual int32_t Pull(TableContext& context);
+  virtual int32_t Push(TableContext& context);
+
   virtual int32_t initialize();
   virtual int32_t initialize_shard() { return 0; }
   virtual int32_t initialize_value();
diff --git a/paddle/fluid/distributed/ps/table/common_table.h b/paddle/fluid/distributed/ps/table/common_table.h
index bac826dfe0e20..3d291c0152246 100644
--- a/paddle/fluid/distributed/ps/table/common_table.h
+++ b/paddle/fluid/distributed/ps/table/common_table.h
@@ -119,6 +119,9 @@ class BarrierTable : public Table {
 
   virtual void *get_shard(size_t shard_idx) { return 0; }
 
+  virtual int32_t Pull(TableContext &context) { return 0; }
+  virtual int32_t Push(TableContext &context) { return 0; }
+
   int32_t pull_dense(float *values, size_t num) override { return 0; }
 
   int32_t push_dense(const float *values, size_t num) override { return 0; }
diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
index 866bd8114ccea..43e143dca901b 100644
--- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc
@@ -38,6 +38,16 @@ int CtrCommonAccessor::initialize() {
   return 0;
 }
 
+void CtrCommonAccessor::GetTableInfo(AccessorInfo& info) {
+  info.dim = dim();
+  info.size = size();
+  info.select_dim = select_dim();
+  info.select_size = select_size();
+  info.update_dim = update_dim();
+  info.update_size = update_size();
+  info.fea_dim = fea_dim();
+}
+
 size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); }
 
 size_t CtrCommonAccessor::dim_size(size_t dim) {
diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h
index 1e31fec04649b..bc46217955a8a 100644
--- a/paddle/fluid/distributed/ps/table/ctr_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h
@@ -126,6 +126,7 @@ class CtrCommonAccessor : public ValueAccessor {
   virtual int initialize();
   virtual ~CtrCommonAccessor() {}
 
+  virtual void GetTableInfo(AccessorInfo& info);
   // value维度
   virtual size_t dim();
   // value各个维度的size
diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
index b07bcf70ad7af..bccf1fdebafa0 100644
--- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc
@@ -37,6 +37,16 @@ int DownpourCtrDoubleAccessor::initialize() {
   return 0;
 }
 
+void DownpourCtrDoubleAccessor::GetTableInfo(AccessorInfo& info) {
+  info.dim = dim();
+  info.size = size();
+  info.select_dim = select_dim();
+  info.select_size = select_size();
+  info.update_dim = update_dim();
+  info.update_size = update_size();
+  info.fea_dim = fea_dim();
+}
+
 size_t DownpourCtrDoubleAccessor::dim() {
   auto embedx_dim = _config.embedx_dim();
   return DownpourCtrDoubleFeatureValue::dim(embedx_dim);
diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
index d7c717ace0988..d7942634e8600 100644
--- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
+++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h
@@ -168,6 +168,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor {
   DownpourCtrDoubleAccessor() {}
   virtual ~DownpourCtrDoubleAccessor() {}
   virtual int initialize();
+  virtual void GetTableInfo(AccessorInfo& info);
   // value维度
   virtual size_t dim();
   // value各个维度的size
diff --git a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h
index 708f7786bf3b0..98e0250acc4d6 100644
--- a/paddle/fluid/distributed/ps/table/depends/sparse_utils.h
+++ b/paddle/fluid/distributed/ps/table/depends/sparse_utils.h
@@ -58,7 +58,7 @@ struct PullSparseValue {
                std::vector<int>* offset_shard) const {
     offset_shard->reserve(numel_ / shard_num + 1);
     for (int x = 0; x < numel_; ++x) {
-      if (feasigns_[x] % shard_num == shard_id) {
+      if (int(feasigns_[x] % shard_num) == shard_id) {
         offset_shard->push_back(x);
       }
     }
diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc
index 5f22c3a436f1f..e8ca7430351de 100644
--- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.cc
@@ -37,6 +37,16 @@ int DownpourCtrAccessor::initialize() {
   return 0;
 }
 
+void DownpourCtrAccessor::GetTableInfo(AccessorInfo& info) {
+  info.dim = dim();
+  info.size = size();
+  info.select_dim = select_dim();
+  info.select_size = select_size();
+  info.update_dim = update_dim();
+  info.update_size = update_size();
+  info.fea_dim = fea_dim();
+}
+
 size_t DownpourCtrAccessor::dim() {
   auto embedx_dim = _config.embedx_dim();
   return DownpourCtrFeatureValue::dim(embedx_dim);
diff --git a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h
index 5de7b12e01f0d..11991ad044ff6 100644
--- a/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h
+++ b/paddle/fluid/distributed/ps/table/downpour_ctr_accessor.h
@@ -160,6 +160,7 @@ class DownpourCtrAccessor : public ValueAccessor {
   virtual ~DownpourCtrAccessor() {}
 
   virtual int initialize();
+  virtual void GetTableInfo(AccessorInfo& info);
   // value维度
   virtual size_t dim();
   // value各个维度的size
diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h
new file mode 100644
index 0000000000000..bf59dbacb2537
--- /dev/null
+++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a;
+#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a)
+#define DECLARE_2_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_3_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_4_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_5_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_6_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_7_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_8_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_9_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_10_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__)
+#define DECLARE_11_FRIEND_CLASS(a, ...) \
+  DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__)
+#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) \
+  DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__)
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
index d1961b655d882..004a536e8e56c 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
+++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc
@@ -17,11 +17,11 @@
 namespace paddle {
 namespace distributed {
 
-void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
+void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
   id_arr.push_back(id);
 }
 
-void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
+void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) {
   id_arr.push_back(id);
   weight_arr.push_back(weight);
 }
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h
index 3dfe5a6f357a7..5fc785fe25682 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h
+++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h
@@ -24,19 +24,20 @@ class GraphEdgeBlob {
   GraphEdgeBlob() {}
   virtual ~GraphEdgeBlob() {}
   size_t size() { return id_arr.size(); }
-  virtual void add_edge(uint64_t id, float weight);
-  uint64_t get_id(int idx) { return id_arr[idx]; }
+  virtual void add_edge(int64_t id, float weight);
+  int64_t get_id(int idx) { return id_arr[idx]; }
   virtual float get_weight(int idx) { return 1; }
+  std::vector<int64_t>& export_id_array() { return id_arr; }
 
  protected:
-  std::vector<uint64_t> id_arr;
+  std::vector<int64_t> id_arr;
 };
 
 class WeightedGraphEdgeBlob : public GraphEdgeBlob {
  public:
   WeightedGraphEdgeBlob() {}
   virtual ~WeightedGraphEdgeBlob() {}
-  virtual void add_edge(uint64_t id, float weight);
+  virtual void add_edge(int64_t id, float weight);
   virtual float get_weight(int idx) { return weight_arr[idx]; }
 
  protected:
diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h
index b838c2c1258d8..c6c594036d4fc 100644
--- a/paddle/fluid/distributed/ps/table/graph/graph_node.h
+++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h
@@ -48,6 +48,7 @@ class Node {
   virtual void set_feature(int idx, std::string str) {}
   virtual void set_feature_size(int size) {}
   virtual int get_feature_size() { return 0; }
+  virtual size_t get_neighbor_size() { return 0; }
 
  protected:
   uint64_t id;
@@ -70,6 +71,7 @@ class GraphNode : public Node {
   }
   virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); }
   virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); }
+  virtual size_t get_neighbor_size() { return edges->size(); }
 
  protected:
   Sampler *sampler;
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
index 89c4fc15ae279..3b43f99543fdd 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h
@@ -48,6 +48,8 @@ class MemorySparseGeoTable : public SparseTable {
   virtual int32_t save(const std::string& path, const std::string& param) {
     return 0;
   }
+  virtual int32_t Pull(TableContext& context) { return 0; }
+  virtual int32_t Push(TableContext& context) { return 0; }
   virtual int32_t flush() { return 0; }
   virtual int32_t shrink(const std::string& param) { return 0; }
   virtual void clear() { return; }
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
index 7ce6e9005cf56..98454ca747d31 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc
@@ -390,6 +390,26 @@ std::pair<int64_t, int64_t> MemorySparseTable::print_table_stat() {
   return {feasign_size, mf_size};
 }
 
+int32_t MemorySparseTable::Pull(TableContext& context) {
+  CHECK(context.value_type == Sparse);
+  if (context.use_ptr) {
+    char** pull_values = context.pull_context.ptr_values;
+    const uint64_t* keys = context.pull_context.keys;
+    return pull_sparse_ptr(pull_values, keys, context.num);
+  } else {
+    float* pull_values = context.pull_context.values;
+    const PullSparseValue& pull_value = context.pull_context.pull_value;
+    return pull_sparse(pull_values, pull_value);
+  }
+}
+
+int32_t MemorySparseTable::Push(TableContext& context) {
+  CHECK(context.value_type == Sparse);
+
+  const uint64_t* keys = context.push_context.keys;
+  return push_sparse(keys, context.push_context.ptr_values, context.num);
+}
+
 int32_t MemorySparseTable::pull_sparse(float* pull_values,
                                        const PullSparseValue& pull_value) {
   CostTimer timer("pserver_sparse_select_all");
diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
index 5770f25f8f41d..d26c67319760d 100644
--- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h
@@ -48,6 +48,9 @@ class MemorySparseTable : public SparseTable {
   virtual int32_t push_dense(const float* values, size_t num) { return 0; }
   // unused method end
 
+  virtual int32_t Pull(TableContext& context);
+  virtual int32_t Push(TableContext& context);
+
   virtual int32_t initialize();
   virtual int32_t initialize_shard() { return 0; }
   virtual int32_t initialize_value();
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index 60514b4e19ffa..5bc58bc5a1108 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -61,6 +61,21 @@ int32_t SSDSparseTable::initialize() {
   return 0;
 }
 
+int32_t SSDSparseTable::Pull(TableContext& context) {
+  CHECK(context.value_type == Sparse);
+  if (context.use_ptr) {
+    char** pull_values = context.pull_context.ptr_values;
+    const uint64_t* keys = context.pull_context.keys;
+    return pull_sparse_ptr(pull_values, keys, context.num);
+  } else {
+    float* pull_values = context.pull_context.values;
+    const PullSparseValue& pull_value = context.pull_context.pull_value;
+    return pull_sparse(pull_values, pull_value);
+  }
+}
+
+int32_t SSDSparseTable::Push(TableContext& context) { return 0; }
+
 int32_t SSDSparseTable::pull_sparse(float* pull_values,
                                     const PullSparseValue& pull_value) {
   auto shard_num = task_pool_size_;
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
index f5e8a7067e0e0..3a703d7d966d3 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
@@ -42,6 +42,9 @@ class SSDSparseTable : public CommonSparseTable {
   // exchange data
   virtual int32_t update_table();
 
+  virtual int32_t Pull(TableContext& context);
+  virtual int32_t Push(TableContext& context);
+
   virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
 
   virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc
index fa8169da07ab7..fc2ea56e95d77 100644
--- a/paddle/fluid/distributed/ps/table/table.cc
+++ b/paddle/fluid/distributed/ps/table/table.cc
@@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable);
 REGISTER_PSCORE_CLASS(Table, CommonSparseTable);
 #ifdef PADDLE_WITH_HETERPS
 REGISTER_PSCORE_CLASS(Table, SSDSparseTable);
+REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler);
+REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler);
 #endif
 REGISTER_PSCORE_CLASS(Table, SparseGeoTable);
 REGISTER_PSCORE_CLASS(Table, BarrierTable);
diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h
index da1bb668ccfa3..2bd2a42b6c58f 100644
--- a/paddle/fluid/distributed/ps/table/table.h
+++ b/paddle/fluid/distributed/ps/table/table.h
@@ -32,6 +32,30 @@
 
 namespace paddle {
 namespace distributed {
+
+enum ValueType { Sparse = 0, Dense = 1 };
+
+struct PullContext {
+  const uint64_t *keys;
+  const PullSparseValue pull_value;
+  float *values;
+  char **ptr_values;
+};
+
+struct TablePushContext {
+  const uint64_t *keys;
+  const float *values;
+  const float **ptr_values;
+};
+
+struct TableContext {
+  ValueType value_type;
+  PullContext pull_context;
+  TablePushContext push_context;
+  size_t num;
+  bool use_ptr;
+};
+
 class Table {
  public:
   Table() {}
@@ -39,6 +63,8 @@ class Table {
   virtual int32_t initialize(const TableParameter &config,
                              const FsClientParameter &fs_config);
 
+  virtual int32_t Pull(TableContext &context) = 0;
+  virtual int32_t Push(TableContext &context) = 0;
   virtual int32_t pull_dense(float *values, size_t num) = 0;
   virtual int32_t push_dense(const float *values, size_t num) = 0;
   // for push global_step
diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.cc b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
index 70a580c1e53a9..8c5349bff832c 100644
--- a/paddle/fluid/distributed/ps/table/tensor_accessor.cc
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.cc
@@ -20,6 +20,16 @@ namespace distributed {
 
 int CommMergeAccessor::initialize() { return 0; }
 
+void CommMergeAccessor::GetTableInfo(AccessorInfo &info) {
+  info.dim = dim();
+  info.size = size();
+  info.select_dim = select_dim();
+  info.select_size = select_size();
+  info.update_dim = update_dim();
+  info.update_size = update_size();
+  info.fea_dim = fea_dim();
+}
+
 // value 维度
 size_t CommMergeAccessor::dim() { return 0; }
 
diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h
index 5041b8fdf8733..1873b743b44ec 100644
--- a/paddle/fluid/distributed/ps/table/tensor_accessor.h
+++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h
@@ -30,6 +30,7 @@ class CommMergeAccessor : public ValueAccessor {
   CommMergeAccessor() {}
   virtual ~CommMergeAccessor() {}
   virtual int initialize();
+  virtual void GetTableInfo(AccessorInfo &info);
   // value维度
   virtual size_t dim();
   // value各个维度的size
diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h
index 64d81327acc55..23a62365c0f5a 100644
--- a/paddle/fluid/distributed/ps/table/tensor_table.h
+++ b/paddle/fluid/distributed/ps/table/tensor_table.h
@@ -48,6 +48,8 @@ class TensorTable : public Table {
   TensorTable() {}
   virtual ~TensorTable() {}
 
+  virtual int32_t Pull(TableContext &context) { return 0; }
+  virtual int32_t Push(TableContext &context) { return 0; }
   int32_t pull_dense(float *values, size_t num) override { return 0; }
 
   int32_t push_dense(const float *values, size_t num) override { return 0; }
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
index 0588dbdf0fc61..c887cfeb71eef 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -30,6 +30,32 @@ bool FleetWrapper::is_initialized_ = false;
 
 std::shared_ptr<paddle::distributed::PSCore> FleetWrapper::pserver_ptr_ = NULL;
 
+void FleetWrapper::Stop() { StopServer(); }
+
+void FleetWrapper::Load(WrapperContext& context) {
+  auto table_id = context.table_id;
+  if (table_id >= 0 && context.meta != "") {
+    LoadSparseOnServer(context.path, context.meta, context.table_id);
+    return;
+  }
+  if (table_id < 0) {  // laod all
+    LoadModel(context.path, context.mode);
+  } else {  // load one table
+    LoadModelOneTable(table_id, context.path, context.mode);
+  }
+  return;
+}
+
+void FleetWrapper::Save(WrapperContext& context) {
+  auto table_id = context.table_id;
+  if (table_id < 0) {
+    SaveModel(context.path, context.mode);
+  } else {
+    SaveModelOneTable(table_id, context.path, context.mode);
+  }
+  return;
+}
+
 void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms,
                                           int connect_timeout_ms,
                                           int max_retry) {
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h
index a535b8c5bf8f9..d68c453c6d51b 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.h
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.h
@@ -25,6 +25,7 @@ limitations under the License. */
 
 #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
 #include "paddle/fluid/distributed/ps/service/ps_service/service.h"
+#include "paddle/fluid/distributed/ps/wrapper/ps_wrapper.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/framework/io/fs.h"
 #include "paddle/fluid/framework/io/shell.h"
@@ -54,7 +55,7 @@ using framework::Variable;
 
 using RpcCtxMap = std::unordered_map<std::string, CommContext>;
 
-class FleetWrapper {
+class FleetWrapper : public PSWrapper {
  public:
   virtual ~FleetWrapper() {}
   FleetWrapper() {
@@ -68,7 +69,13 @@ class FleetWrapper {
     // pserver request max retry
     client2client_max_retry_ = 3;
   }
+  virtual int32_t Initialize(InitContext& context) { return 0; }
 
+  virtual void Stop() override;
+
+  virtual void Load(WrapperContext& context) override;
+
+  virtual void Save(WrapperContext& context) override;
   // set client to client communication config
   void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
                               int max_retry);
diff --git a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
index c92835aa995ad..ca02ad31195ef 100755
--- a/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
+++ b/paddle/fluid/distributed/ps/wrapper/ps_wrapper.h
@@ -1,18 +1,84 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
-#define PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
-
-#endif  // PADDLE_FLUID_DISTRIBUTED_PS_WRAPPER_PS_WRAPPER_H_
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <ctime>
+#include <map>
+#include <memory>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
+#include "paddle/fluid/distributed/ps/service/ps_service/service.h"
+#include "paddle/fluid/framework/archive.h"
+#include "paddle/fluid/framework/io/fs.h"
+#include "paddle/fluid/framework/io/shell.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+class Scope;
+class SelectedRows;
+class Variable;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace distributed {
+
+class PSCore;
+
+using framework::LoDTensor;
+using framework::Scope;
+using phi::SelectedRows;
+using framework::Variable;
+
+using RpcCtxMap = std::unordered_map<std::string, CommContext>;
+
+struct WrapperContext {
+  uint32_t table_id;
+  const std::string path;
+  const int mode;
+  const std::string meta;
+};
+
+struct InitContext {
+  const std::vector<int> dev_ids;  // for gpu
+};
+
+class PSWrapper {
+ public:
+  virtual ~PSWrapper() {}
+  PSWrapper() {}
+  // init server
+
+  virtual int32_t Initialize(InitContext& context) = 0;
+
+  virtual void Stop() = 0;
+
+  virtual void Load(WrapperContext& context) = 0;
+
+  virtual void Save(WrapperContext& context) = 0;
+};
+
+}  // end namespace distributed
+}  // end namespace paddle
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt
index 2223334ccc442..cb46c38d4de4b 100644
--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope serv
 set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
 
+set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS  scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
+
 set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table)
 
diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc
index 9949dce4e933b..a2f495de3c953 100644
--- a/paddle/fluid/distributed/test/graph_node_split_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_split_test.cc
@@ -236,7 +236,7 @@ void RunGraphSplit() {
   sleep(2);
   std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
   dense_regions.insert(
-      std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
+      std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {}));
   auto regions = dense_regions[0];
 
   RunClient(dense_regions, 0, pserver_ptr_->get_service());
@@ -250,16 +250,16 @@ void RunGraphSplit() {
       worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
   srand(time(0));
   pull_status.wait();
-  std::vector<std::vector<uint64_t>> _vs;
+  std::vector<std::vector<int64_t>> _vs;
   std::vector<std::vector<float>> vs;
   pull_status = worker_ptr_->batch_sample_neighbors(
-      0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true);
+      0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true);
   pull_status.wait();
   ASSERT_EQ(0, _vs[0].size());
   _vs.clear();
   vs.clear();
   pull_status = worker_ptr_->batch_sample_neighbors(
-      0, std::vector<uint64_t>(1, 97), 4, _vs, vs, true);
+      0, std::vector<int64_t>(1, 97), 4, _vs, vs, true);
   pull_status.wait();
   ASSERT_EQ(3, _vs[0].size());
   std::remove(edge_file_name);
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
index 22c2d1e60992e..565d51379d5a8 100644
--- a/paddle/fluid/distributed/test/graph_node_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -48,10 +48,10 @@ namespace distributed = paddle::distributed;
 
 void testSampleNodes(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
-  std::vector<uint64_t> ids;
+  std::vector<int64_t> ids;
   auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids);
-  std::unordered_set<uint64_t> s;
-  std::unordered_set<uint64_t> s1 = {37, 59};
+  std::unordered_set<int64_t> s;
+  std::unordered_set<int64_t> s1 = {37, 59};
   pull_status.wait();
   for (auto id : ids) s.insert(id);
   ASSERT_EQ(true, s.size() == s1.size());
@@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() {
 
 void testSingleSampleNeighboor(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
-  std::vector<std::vector<uint64_t>> vs;
+  std::vector<std::vector<int64_t>> vs;
   std::vector<std::vector<float>> vs1;
   auto pull_status = worker_ptr_->batch_sample_neighbors(
-      0, std::vector<uint64_t>(1, 37), 4, vs, vs1, true);
+      0, std::vector<int64_t>(1, 37), 4, vs, vs1, true);
   pull_status.wait();
 
-  std::unordered_set<uint64_t> s;
-  std::unordered_set<uint64_t> s1 = {112, 45, 145};
+  std::unordered_set<int64_t> s;
+  std::unordered_set<int64_t> s1 = {112, 45, 145};
   for (auto g : vs[0]) {
     s.insert(g);
   }
@@ -126,7 +126,7 @@ void testSingleSampleNeighboor(
   vs.clear();
   vs1.clear();
   pull_status = worker_ptr_->batch_sample_neighbors(
-      0, std::vector<uint64_t>(1, 96), 4, vs, vs1, true);
+      0, std::vector<int64_t>(1, 96), 4, vs, vs1, true);
   pull_status.wait();
   s1 = {111, 48, 247};
   for (auto g : vs[0]) {
@@ -147,30 +147,30 @@ void testAddNode(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
   worker_ptr_->clear_nodes(0);
   int total_num = 270000;
-  uint64_t id;
-  std::unordered_set<uint64_t> id_set;
+  int64_t id;
+  std::unordered_set<int64_t> id_set;
   for (int i = 0; i < total_num; i++) {
     while (id_set.find(id = rand()) != id_set.end())
       ;
     id_set.insert(id);
   }
-  std::vector<uint64_t> id_list(id_set.begin(), id_set.end());
+  std::vector<int64_t> id_list(id_set.begin(), id_set.end());
   std::vector<bool> weight_list;
   auto status = worker_ptr_->add_graph_node(0, id_list, weight_list);
   status.wait();
-  std::vector<uint64_t> ids[2];
+  std::vector<int64_t> ids[2];
   for (int i = 0; i < 2; i++) {
     auto sample_status =
         worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]);
     sample_status.wait();
   }
-  std::unordered_set<uint64_t> id_set_check(ids[0].begin(), ids[0].end());
+  std::unordered_set<int64_t> id_set_check(ids[0].begin(), ids[0].end());
   for (auto x : ids[1]) id_set_check.insert(x);
   ASSERT_EQ(id_set.size(), id_set_check.size());
   for (auto x : id_set) {
     ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true);
   }
-  std::vector<uint64_t> remove_ids;
+  std::vector<int64_t> remove_ids;
   for (auto p : id_set_check) {
     if (remove_ids.size() == 0)
       remove_ids.push_back(p);
@@ -187,7 +187,7 @@ void testAddNode(
         worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]);
     sample_status.wait();
   }
-  std::unordered_set<uint64_t> id_set_check1(ids[0].begin(), ids[0].end());
+  std::unordered_set<int64_t> id_set_check1(ids[0].begin(), ids[0].end());
   for (auto x : ids[1]) id_set_check1.insert(x);
   ASSERT_EQ(id_set_check1.size(), id_set_check.size());
   for (auto x : id_set_check1) {
@@ -196,14 +196,14 @@ void testAddNode(
 }
 void testBatchSampleNeighboor(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
-  std::vector<std::vector<uint64_t>> vs;
+  std::vector<std::vector<int64_t>> vs;
   std::vector<std::vector<float>> vs1;
-  std::vector<std::uint64_t> v = {37, 96};
+  std::vector<std::int64_t> v = {37, 96};
   auto pull_status =
       worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false);
   pull_status.wait();
-  std::unordered_set<uint64_t> s;
-  std::unordered_set<uint64_t> s1 = {112, 45, 145};
+  std::unordered_set<int64_t> s;
+  std::unordered_set<int64_t> s1 = {112, 45, 145};
   for (auto g : vs[0]) {
     s.insert(g);
   }
@@ -417,7 +417,7 @@ void RunBrpcPushSparse() {
 
   std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
   dense_regions.insert(
-      std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
+      std::pair<int64_t, std::vector<paddle::distributed::Region>>(0, {}));
   auto regions = dense_regions[0];
 
   RunClient(dense_regions, 0, pserver_ptr_->get_service());
@@ -427,14 +427,14 @@ void RunBrpcPushSparse() {
       worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
   srand(time(0));
   pull_status.wait();
-  std::vector<std::vector<uint64_t>> _vs;
+  std::vector<std::vector<int64_t>> _vs;
   std::vector<std::vector<float>> vs;
   testSampleNodes(worker_ptr_);
   sleep(5);
   testSingleSampleNeighboor(worker_ptr_);
   testBatchSampleNeighboor(worker_ptr_);
   pull_status = worker_ptr_->batch_sample_neighbors(
-      0, std::vector<uint64_t>(1, 10240001024), 4, _vs, vs, true);
+      0, std::vector<int64_t>(1, 10240001024), 4, _vs, vs, true);
   pull_status.wait();
   ASSERT_EQ(0, _vs[0].size());
   paddle::distributed::GraphTable* g =
@@ -445,14 +445,14 @@ void RunBrpcPushSparse() {
   while (round--) {
     vs.clear();
     pull_status = worker_ptr_->batch_sample_neighbors(
-        0, std::vector<uint64_t>(1, 37), 1, _vs, vs, false);
+        0, std::vector<int64_t>(1, 37), 1, _vs, vs, false);
     pull_status.wait();
 
     for (int i = 0; i < ttl; i++) {
-      std::vector<std::vector<uint64_t>> vs1;
+      std::vector<std::vector<int64_t>> vs1;
       std::vector<std::vector<float>> vs2;
       pull_status = worker_ptr_->batch_sample_neighbors(
-          0, std::vector<uint64_t>(1, 37), 1, vs1, vs2, false);
+          0, std::vector<int64_t>(1, 37), 1, vs1, vs2, false);
       pull_status.wait();
       ASSERT_EQ(_vs[0].size(), vs1[0].size());
 
@@ -540,7 +540,7 @@ void RunBrpcPushSparse() {
 
   // Test Pull by step
 
-  std::unordered_set<uint64_t> count_item_nodes;
+  std::unordered_set<int64_t> count_item_nodes;
   // pull by step 2
   for (int test_step = 1; test_step < 4; test_step++) {
     count_item_nodes.clear();
@@ -558,18 +558,18 @@ void RunBrpcPushSparse() {
     ASSERT_EQ(count_item_nodes.size(), 12);
   }
 
-  std::pair<std::vector<std::vector<uint64_t>>, std::vector<float>> res;
+  std::pair<std::vector<std::vector<int64_t>>, std::vector<float>> res;
   res = client1.batch_sample_neighbors(
-      std::string("user2item"), std::vector<uint64_t>(1, 96), 4, true, false);
+      std::string("user2item"), std::vector<int64_t>(1, 96), 4, true, false);
   ASSERT_EQ(res.first[0].size(), 3);
-  std::vector<uint64_t> node_ids;
+  std::vector<int64_t> node_ids;
   node_ids.push_back(96);
   node_ids.push_back(37);
   res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4,
                                        true, false);
 
   ASSERT_EQ(res.first[1].size(), 1);
-  std::vector<uint64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6);
+  std::vector<int64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6);
   ASSERT_EQ(nodes_ids.size(), 2);
   ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) ||
                       (nodes_ids[0] == 37 && nodes_ids[1] == 59));
diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc
new file mode 100644
index 0000000000000..65455028247dd
--- /dev/null
+++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+#include <condition_variable>  // NOLINT
+#include <fstream>
+#include <iomanip>
+#include <string>
+#include <thread>  // NOLINT
+#include <unordered_set>
+#include <vector>
+#include "google/protobuf/text_format.h"
+
+#include <chrono>
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/ps/service/env.h"
+#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
+#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
+#include "paddle/fluid/distributed/ps/table/graph/graph_node.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace operators = paddle::operators;
+namespace memory = paddle::memory;
+namespace distributed = paddle::distributed;
+
+std::vector<std::string> edges = {
+    std::string("37\t45\t0.34"),  std::string("37\t145\t0.31"),
+    std::string("37\t112\t0.21"), std::string("96\t48\t1.4"),
+    std::string("96\t247\t0.31"), std::string("96\t111\t1.21"),
+    std::string("59\t45\t0.34"),  std::string("59\t145\t0.31"),
+    std::string("59\t122\t0.21"), std::string("97\t48\t0.34"),
+    std::string("97\t247\t0.31"), std::string("97\t111\t0.21")};
+// odd id:96 48 122 112
+char edge_file_name[] = "edges.txt";
+
+std::vector<std::string> nodes = {
+    std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"),
+    std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"),
+    std::string("user\t59\ta 0.11\tb 11 14"),
+    std::string("user\t97\ta 0.11\tb 12 11"),
+    std::string("item\t45\ta 0.21"),
+    std::string("item\t145\ta 0.21"),
+    std::string("item\t112\ta 0.21"),
+    std::string("item\t48\ta 0.21"),
+    std::string("item\t247\ta 0.21"),
+    std::string("item\t111\ta 0.21"),
+    std::string("item\t46\ta 0.21"),
+    std::string("item\t146\ta 0.21"),
+    std::string("item\t122\ta 0.21"),
+    std::string("item\t49\ta 0.21"),
+    std::string("item\t248\ta 0.21"),
+    std::string("item\t113\ta 0.21")};
+char node_file_name[] = "nodes.txt";
+
+void prepare_file(char file_name[], std::vector<std::string> data) {
+  std::ofstream ofile;
+  ofile.open(file_name);
+  for (auto x : data) {
+    ofile << x << std::endl;
+  }
+
+  ofile.close();
+}
+
+void testGraphSample() {
+#ifdef PADDLE_WITH_HETERPS
+  ::paddle::distributed::GraphParameter table_proto;
+  table_proto.set_gpups_mode(true);
+  table_proto.set_gpups_mode_shard_num(127);
+  table_proto.set_gpu_num(2);
+
+  distributed::GraphTable graph_table, graph_table1;
+  graph_table.initialize(table_proto);
+  prepare_file(edge_file_name, edges);
+  graph_table.load(std::string(edge_file_name), std::string("e>"));
+  std::vector<paddle::framework::GpuPsCommGraph> res;
+  std::promise<int> prom;
+  std::future<int> fut = prom.get_future();
+  graph_table.set_graph_sample_callback(
+      [&res, &prom](std::vector<paddle::framework::GpuPsCommGraph> &res0) {
+        res = res0;
+        prom.set_value(0);
+      });
+  graph_table.start_graph_sampling();
+  fut.get();
+  graph_table.end_graph_sampling();
+  ASSERT_EQ(2, res.size());
+  // 37 59 97
+  for (int i = 0; i < (int)res[1].node_size; i++) {
+    std::cout << res[1].node_list[i].node_id << std::endl;
+  }
+  ASSERT_EQ(3, res[1].node_size);
+
+  ::paddle::distributed::GraphParameter table_proto1;
+  table_proto1.set_gpups_mode(true);
+  table_proto1.set_gpups_mode_shard_num(127);
+  table_proto1.set_gpu_num(2);
+  table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler");
+  table_proto1.set_gpups_graph_sample_args("5,5,1,1");
+  graph_table1.initialize(table_proto1);
+  graph_table1.load(std::string(edge_file_name), std::string("e>"));
+  std::vector<paddle::framework::GpuPsCommGraph> res1;
+  std::promise<int> prom1;
+  std::future<int> fut1 = prom1.get_future();
+  graph_table1.set_graph_sample_callback(
+      [&res1, &prom1](std::vector<paddle::framework::GpuPsCommGraph> &res0) {
+        res1 = res0;
+        prom1.set_value(0);
+      });
+  graph_table1.start_graph_sampling();
+  fut1.get();
+  graph_table1.end_graph_sampling();
+  // distributed::BasicBfsGraphSampler *sampler1 =
+  //     (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler();
+  //     sampler1->start_graph_sampling();
+  //     std::this_thread::sleep_for (std::chrono::seconds(1));
+  // std::vector<paddle::framework::GpuPsCommGraph> res1;// =
+  // sampler1->fetch_sample_res();
+  ASSERT_EQ(2, res1.size());
+  // odd id:96 48 122 112
+  for (int i = 0; i < (int)res1[0].node_size; i++) {
+    std::cout << res1[0].node_list[i].node_id << std::endl;
+  }
+  ASSERT_EQ(4, res1[0].node_size);
+#endif
+}
+
+TEST(testGraphSample, Run) { testGraphSample(); }
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc
index 3a2ec403c0a59..10696dbacd35b 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.cc
+++ b/paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -24,7 +24,7 @@
 #include "paddle/fluid/platform/errors.h"
 
 #include "glog/logging.h"
-
+DECLARE_bool(retain_grad_for_all_tensor);
 namespace egr {
 
 static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
@@ -40,7 +40,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
 
 std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
 operator()(
-    const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
+    std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+    bool create_graph) {
   VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
   PADDLE_ENFORCE(grads.size() == 1,
                  paddle::platform::errors::Fatal(
@@ -62,7 +63,7 @@ operator()(
     grad_out = grads[0][0];
   }
 
-  if (!weak_grad_.expired()) {
+  if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) {
     auto grad = weak_grad_.lock();
     CopyOrAddTensor(grad.get(), grad_out);
   }
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h
index 07fa40165167c..2e38d7e9e91e2 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.h
+++ b/paddle/fluid/eager/accumulation/accumulation_node.h
@@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase {
 
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
-      override;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+      bool create_graph = false) override;
+
+  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+
+  bool IsTensorWrappersCleared() override {
+    VLOG(6) << "Do nothing here now";
+    return false;
+  }
 
   std::string name() { return "GradNodeAccumulation"; }
 
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
index 5a2595b9103e4..d9f5447a88e9b 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc
@@ -146,7 +146,8 @@ void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
 
 std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
 operator()(
-    const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
+    std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+    bool create_graph) {
   // 1. Check Output Size
   PADDLE_ENFORCE(
       ((grads.size() == 1) && (grads[0].size() == 1)),
diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
index 247fde6ed1f86..0b942d2a06707 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
+++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h
@@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase {
 
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
-      override;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+      bool create_graph = false) override;
+
+  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+
+  bool IsTensorWrappersCleared() override {
+    VLOG(6) << "Do nothing here now";
+    return false;
+  }
 
   void SetTensorWrappers_X(
       const std::vector<paddle::experimental::Tensor>& tensors);
diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
index ba6a936d68651..1be3b31de00a6 100644
--- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
+++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc
@@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x,
     scale_node->SetTensorWrappers_X({x});
 
     // Set Grad out rank as same as fwd input and set stop gradient to bwd
-    scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0);
+    scale_node->SetGradOutMeta(x, /*slot id*/ 0);
     // Set Grad out rank as same as fwd input and set stop gradient to bwd
-    scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0);
+    scale_node->SetGradInMeta(out, /*slot id*/ 0);
 
     // Set History for output set current Grad Node for
     EagerUtils::SetHistory(p_autograd_out, scale_node);
diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc
index 77c39d1b0a37c..81ea92d1c3c48 100644
--- a/paddle/fluid/eager/api/utils/tensor_utils.cc
+++ b/paddle/fluid/eager/api/utils/tensor_utils.cc
@@ -30,7 +30,8 @@ namespace egr_utils_api {
 
 bool IsLeafTensor(const paddle::experimental::Tensor& target) {
   std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(target);
-  if (std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
+  if (!grad_node ||
+      std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
     return true;
   }
 
@@ -42,8 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue(
     const phi::DataType& dtype, const phi::DataLayout& layout, float value,
     bool is_leaf) {
   paddle::experimental::Tensor out = paddle::experimental::full(
-      phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype,
-      phi::TransToPhiBackend(place));
+      phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);
 
   auto meta = EagerUtils::autograd_meta(&out);
   if (is_leaf) {
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index dc79a8a45a246..229817596423c 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
 static std::unordered_map<std::string, paddle::framework::AttributeMap>
     operators_with_attrs = {};
 
+static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
+    "split"};
+
 /* --- Black Ops list that's NO NEED to apply code generation --- */
 static std::unordered_set<std::string> black_ops_list = {"run_program"};
 
@@ -56,23 +59,29 @@ static std::string LegalizeVariableName(const std::string& var_name) {
   return ret;
 }
 
-static bool IgnoreGradAttribute(const std::string& op_type,
-                                const std::string& attr_name) {
-  // Attributes in operators_with_attrs are created manually during code
-  // generation
-  // We should ignore these arbitrary attrs when setting up grad attribute map
-  if (operators_with_attrs.count(op_type)) {
-    if (operators_with_attrs[op_type].count(attr_name)) {
-      return true;
-    }
-  }
+static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type,
+                                               const std::string& attrs_name) {
+  std::string additional_grad_attrs_str = "";
+
+  if (fwd_op_type == "sum") {
+    const char* GRAD_ATTRS_TEMPLATE = "  %s[\"%s\"] = %s;\n";
+    additional_grad_attrs_str = paddle::string::Sprintf(
+        GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)");
+    additional_grad_attrs_str += paddle::string::Sprintf(
+        GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)");
+    additional_grad_attrs_str += paddle::string::Sprintf(
+        GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)");
 
-  // Only allow SumOp
-  if (op_type != "sum") {
-    return true;
+  } else if (fwd_op_type == "scale") {
+    const char* GRAD_ATTRS_TEMPLATE = "  %s[\"%s\"] = %s;\n";
+
+    additional_grad_attrs_str += paddle::string::Sprintf(
+        GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)");
+    additional_grad_attrs_str += paddle::string::Sprintf(
+        GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)");
   }
 
-  return false;
+  return additional_grad_attrs_str;
 }
 
 static void PrepareAttrMapForOps() {
@@ -973,7 +982,9 @@ static bool CollectGradInformationFromOpInfo(
 /* --------------------------------------------------- */
 static std::string GenerateGradNodeCreationContent(
     const ForwardGenerationInfo& fwd_info,
-    const GradNodeGenerationInfo& bwd_info) {
+    const GradNodeGenerationInfo& bwd_info,
+    const std::string& trace_op_body_str,
+    std::map<std::string, std::string> inplace_map = {}) {
   VLOG(6) << "Generating GradNode Creation codes";
 
   const std::string& op_type = fwd_info.GetOpType();
@@ -992,7 +1003,8 @@ static std::string GenerateGradNodeCreationContent(
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")"
-  std::string get_autograd_meta_str = "  // Prepare Autograd Meta \n";
+  std::string get_input_autograd_meta_str = "  // Prepare Autograd Meta \n";
+  std::string get_output_autograd_meta_str = "";
   // If single output slotname and not duplicable,
   // then generate: "egr::AutogradMeta* p_autograd_out =
   // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")"
@@ -1000,22 +1012,39 @@ static std::string GenerateGradNodeCreationContent(
     const std::string& output_name = output.name();
     const std::string& output_autograd_name = "p_autograd_" + output_name;
 
+    // output autograd_meta should be got after running TraceOP.
     if (output.duplicable()) {
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
-          "  std::vector<egr::AutogradMeta*> %s = "
+          "    std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_output_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
     } else {
-      const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
-          "  egr::AutogradMeta* %s = "
-          "egr::EagerUtils::autograd_meta(&%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
-          GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name);
+      // In inplace op, the case where output is duplicable is not considered.
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(output_name)) {
+        auto inplace_input_name = inplace_map[output_name];
+        const std::string& inplace_input_autograd_name =
+            "p_autograd_" + inplace_input_name;
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    %s = egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str += paddle::string::Sprintf(
+            GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name,
+            inplace_input_name);
+      } else {
+        const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
+            "    egr::AutogradMeta* %s = "
+            "egr::EagerUtils::autograd_meta(&%s);\n";
+        get_output_autograd_meta_str +=
+            paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE,
+                                    output_autograd_name, output_name);
+      }
     }
   }
   VLOG(6) << "Generated outputs autograd_meta";
 
+  // input autograd_meta should be got before running TraceOP (for checking
+  // inplace).
   for (const proto::OpProto::Var& input : in_vars) {
     const std::string& input_name = input.name();
     const std::string& input_autograd_name = "p_autograd_" + input_name;
@@ -1024,28 +1053,46 @@ static std::string GenerateGradNodeCreationContent(
       const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
           "  std::vector<egr::AutogradMeta*> %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
           GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
 
     } else if (input.dispensable()) {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
           GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
 
     } else {
       const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE =
           "  egr::AutogradMeta* %s = "
           "egr::EagerUtils::nullable_autograd_meta(%s);\n";
-      get_autograd_meta_str += paddle::string::Sprintf(
+      get_input_autograd_meta_str += paddle::string::Sprintf(
           GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name);
     }
   }
   VLOG(6) << "Generated inputs autograd_meta";
 
+  // check inplace input to avoid inplace operations on leaf nodes with
+  // stop_gradient=False.
+  std::string check_inplace_str = "";
+  if (!inplace_map.empty()) {
+    const char* CHECKING_INPLACE_TEMPLATE =
+        "  // Check Inplace\n"
+        "  egr::EagerUtils::CheckInplace(%s, p_autograd_%s, "
+        "require_any_grad);\n";
+    for (auto& inplace_pair : inplace_map) {
+      std::string inplace_name = inplace_pair.second;
+      check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE,
+                                                   inplace_name, inplace_name);
+    }
+    VLOG(6) << "Check Inplace Input";
+  }
+
   std::string prepare_autograd_meta_str = "";
-  prepare_autograd_meta_str += get_autograd_meta_str;
+  // only generate input autograd_meta in temporary.
+  // output autograd_meta will be generated after running TraceOP.
+  prepare_autograd_meta_str += get_input_autograd_meta_str;
   prepare_autograd_meta_str += "\n";
 
   // [GradOpNode] GetTraceBackward
@@ -1060,7 +1107,7 @@ static std::string GenerateGradNodeCreationContent(
   size_t bwd_in_slot_num = out_vars.size();
   size_t bwd_out_slot_num = in_vars.size();
   const char* GRAD_OP_NODE_TEMPLATE =
-      "    auto grad_node = std::make_shared<GradNode%s>(%d, %d);\n";
+      "      auto grad_node = std::make_shared<GradNode%s>(%d, %d);\n";
   grad_node_creation_str += "    // Create GradOpNode\n";
   grad_node_creation_str += paddle::string::Sprintf(
       GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num);
@@ -1069,14 +1116,14 @@ static std::string GenerateGradNodeCreationContent(
   VLOG(6) << "Generated GradOpNode construction";
 
   // [GradOpNode] Set Attrs
-  grad_node_creation_str += "    // Set Attributes\n";
-  grad_node_creation_str += "    grad_node->SetAttrMap(std::move(attrs));\n";
+  grad_node_creation_str += "      // Set Attributes\n";
+  grad_node_creation_str += "      grad_node->SetAttrMap(std::move(attrs));\n";
   grad_node_creation_str +=
-      "    grad_node->SetDefaultAttrMap(std::move(default_attrs));\n";
+      "      grad_node->SetDefaultAttrMap(std::move(default_attrs));\n";
   grad_node_creation_str += "\n";
 
   // [GradOpNode] Set TensorWrappers
-  grad_node_creation_str += "    // Set Tensor Wrappers\n";
+  grad_node_creation_str += "      // Set Tensor Wrappers\n";
   for (const auto& iter : op_base_infos) {
     const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
         iter.GetGradInsFwdSlotnameMap();
@@ -1088,10 +1135,18 @@ static std::string GenerateGradNodeCreationContent(
         full_reserved = "true";
       }
       const char* SET_TENSOR_WRAPPER_TEMPLATE =
-          "    grad_node->SetTensorWrapper%s(%s, %s);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name,
-          full_reserved);
+          "      grad_node->SetTensorWrapper%s(%s, %s);\n";
+      // Replace output directly with input in inplace op.
+      if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) {
+        auto inplace_input_name = inplace_map[tensor_wrapper_name];
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
+            inplace_input_name, full_reserved);
+      } else {
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name,
+            tensor_wrapper_name, full_reserved);
+      }
     }
   }
   grad_node_creation_str += "\n";
@@ -1109,12 +1164,12 @@ static std::string GenerateGradNodeCreationContent(
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
 
       const char* SET_GRAD_OUT_META_TEMPLATE =
-          "    grad_node->SetGradOutMeta(%s, %d);\n";
+          "      grad_node->SetGradOutMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position);
+          SET_GRAD_OUT_META_TEMPLATE, input_name, input_position);
 
       const char* ADD_EDGES_TEMPLATE =
-          "    if(%s) grad_node->AddEdges(%s, %d);\n";
+          "      if(%s) grad_node->AddEdges(%s, %d);\n";
       grad_node_creation_str +=
           paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name,
                                   input_autograd_name, input_position);
@@ -1123,11 +1178,11 @@ static std::string GenerateGradNodeCreationContent(
       size_t input_position = fwd_inputs_name_pos_map.at(input_name);
 
       const char* SET_GRAD_OUT_META_TEMPLATE =
-          "    grad_node->SetGradOutMeta(&%s, %d);\n";
+          "      grad_node->SetGradOutMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position);
+          SET_GRAD_OUT_META_TEMPLATE, input_name, input_position);
 
-      const char* ADD_EDGES_TEMPLATE = "    grad_node->AddEdges(&%s, %d);\n";
+      const char* ADD_EDGES_TEMPLATE = "      grad_node->AddEdges(&%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
           ADD_EDGES_TEMPLATE, input_autograd_name, input_position);
     }
@@ -1139,73 +1194,125 @@ static std::string GenerateGradNodeCreationContent(
   std::string pass_stop_gradient_args = "false";
   for (const proto::OpProto::Var& output : out_vars) {
     const std::string& output_name = output.name();
-    const std::string& output_autograd_name = "p_autograd_" + output_name;
-    size_t output_position = fwd_outputs_name_pos_map.at(output_name);
-
-    // Intermediate Tensor does not require SetHistory, nor RetainGrad
-
-    if (output.duplicable()) {
-      pass_stop_gradient_args += ", &" + output_autograd_name;
+    // Replace output directly with input in inplace op.
+    if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      auto inplace_input_name = inplace_map[output_name];
+      const std::string& inplace_input_autograd_name =
+          "p_autograd_" + inplace_input_name;
+      size_t output_position = fwd_outputs_name_pos_map.at(output_name);
+
+      // Intermediate Tensor does not require SetHistory, nor RetainGrad
+      pass_stop_gradient_args += ", " + inplace_input_autograd_name;
       const char* SET_OUT_RANK_TEMPLATE =
-          "    egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n";
+          "      egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+          SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position);
 
       // Intermediate Tensor does not require SetHistory
       if (!output.intermediate()) {
         const char* SET_HISTORY_TEMPLATE =
-            "    egr::EagerUtils::SetHistory(&%s, grad_node);\n";
-        grad_node_creation_str +=
-            paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name);
+            "      egr::EagerUtils::SetHistory(%s, grad_node);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_HISTORY_TEMPLATE, inplace_input_autograd_name);
       }
       const char* SET_GRAD_IN_META_TEMPLATE =
-          "    grad_node->SetGradInMeta(&%s, %d);\n";
+          "      grad_node->SetGradInMeta(%s, %d);\n";
       grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position);
+          SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position);
 
+      // Intermediate Tensor does not require CheckAndRetainGrad
+      if (!output.intermediate()) {
+        VLOG(6) << "Generated Call RetainGradForTensor";
+        const char* RETAIN_GRAD_TEMPLATE =
+            "      egr::EagerUtils::CheckAndRetainGrad(%s);\n";
+        grad_node_creation_str +=
+            paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name);
+      }
     } else {
-      pass_stop_gradient_args += ", " + output_autograd_name;
-      const char* SET_OUT_RANK_TEMPLATE =
-          "    egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+      const std::string& output_autograd_name = "p_autograd_" + output_name;
+      size_t output_position = fwd_outputs_name_pos_map.at(output_name);
 
-      // Intermediate Tensor does not require SetHistory
+      // Intermediate Tensor does not require SetHistory, nor RetainGrad
+
+      if (output.duplicable()) {
+        pass_stop_gradient_args += ", &" + output_autograd_name;
+        const char* SET_OUT_RANK_TEMPLATE =
+            "      egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+
+        // Intermediate Tensor does not require SetHistory
+        if (!output.intermediate()) {
+          const char* SET_HISTORY_TEMPLATE =
+              "      egr::EagerUtils::SetHistory(&%s, grad_node);\n";
+          grad_node_creation_str += paddle::string::Sprintf(
+              SET_HISTORY_TEMPLATE, output_autograd_name);
+        }
+        const char* SET_GRAD_IN_META_TEMPLATE =
+            "      grad_node->SetGradInMeta(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_GRAD_IN_META_TEMPLATE, output_name, output_position);
+
+      } else {
+        pass_stop_gradient_args += ", " + output_autograd_name;
+        const char* SET_OUT_RANK_TEMPLATE =
+            "      egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position);
+
+        // Intermediate Tensor does not require SetHistory
+        if (!output.intermediate()) {
+          const char* SET_HISTORY_TEMPLATE =
+              "      egr::EagerUtils::SetHistory(%s, grad_node);\n";
+          grad_node_creation_str += paddle::string::Sprintf(
+              SET_HISTORY_TEMPLATE, output_autograd_name);
+        }
+        const char* SET_GRAD_IN_META_TEMPLATE =
+            "      grad_node->SetGradInMeta(%s, %d);\n";
+        grad_node_creation_str += paddle::string::Sprintf(
+            SET_GRAD_IN_META_TEMPLATE, output_name, output_position);
+      }
+
+      // Intermediate Tensor does not require CheckAndRetainGrad
       if (!output.intermediate()) {
-        const char* SET_HISTORY_TEMPLATE =
-            "    egr::EagerUtils::SetHistory(%s, grad_node);\n";
+        VLOG(6) << "Generated Call RetainGradForTensor";
+        const char* RETAIN_GRAD_TEMPLATE =
+            "      egr::EagerUtils::CheckAndRetainGrad(%s);\n";
         grad_node_creation_str +=
-            paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name);
+            paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
       }
-      const char* SET_GRAD_IN_META_TEMPLATE =
-          "    grad_node->SetGradInMeta(%s, %d);\n";
-      grad_node_creation_str += paddle::string::Sprintf(
-          SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position);
-    }
-
-    // Intermediate Tensor does not require CheckAndRetainGrad
-    if (!output.intermediate()) {
-      VLOG(6) << "Generated Call RetainGradForTensor";
-      const char* RETAIN_GRAD_TEMPLATE =
-          "    egr::EagerUtils::CheckAndRetainGrad(%s);\n";
-      grad_node_creation_str +=
-          paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name);
     }
   }
   VLOG(6) << "Generated SetGradIn/OutMeta";
 
   // [Generation] GradNode Creation
+  // After getting require_any_grad, firstly use CheckInplace method for inplace
+  // op.
+  // Then execute TraceOp and generate output autograd_meta.
+  // Finally, Construct GradNode. (Replace output directly with input in inplace
+  // op.)
+  // Add event record
+  std::string event_name = op_type + " node_creation";
   const char* GRAD_NODE_CREATION_TEMPLATE =
-      "  %s"
+      "%s"
       "  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n"
-      "  if(require_any_grad) {\n"
-      "    VLOG(6) << \" Construct Grad for %s \"; \n"
-      "    egr::EagerUtils::PassStopGradient(%s);\n"
-      "%s\n  }";
+      "%s\n"
+      "%s"
+      "  {\n"
+      "    paddle::platform::RecordEvent node_creation_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);\n"
+      "%s"
+      "    if(require_any_grad) {\n"
+      "      VLOG(6) << \" Construct Grad for %s \"; \n"
+      "      egr::EagerUtils::PassStopGradient(%s);\n"
+      "  %s\n"
+      "    }\n"
+      "  }";
   std::string grad_node_creation_body_str = paddle::string::Sprintf(
       GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str,
-      compute_require_grad_args, op_type, pass_stop_gradient_args,
-      grad_node_creation_str);
+      compute_require_grad_args, check_inplace_str, trace_op_body_str,
+      event_name, get_output_autograd_meta_str, op_type,
+      pass_stop_gradient_args, grad_node_creation_str);
 
   return grad_node_creation_body_str;
 }
@@ -1215,7 +1322,8 @@ static std::string GenerateGradNodeCreationContent(
 /* -------------------------------- */
 static std::pair<std::string, std::string> GenerateForwardFunctionContents(
     const ForwardGenerationInfo& fwd_info,
-    const GradNodeGenerationInfo& bwd_info) {
+    const GradNodeGenerationInfo& bwd_info,
+    std::map<std::string, std::string> inplace_map = {}) {
   /* --- Process Forward Info ---*/
   const std::string& op_type = fwd_info.GetOpType();
   const std::unordered_map<std::string, size_t>& fwd_inputs_name_pos_map =
@@ -1295,8 +1403,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
 
       core_ops_args_type_info[op_type][input_position] = "list";
     } else {
-      const char* FWD_INS_ARG_TEMPLATE =
-          "const paddle::experimental::Tensor& %s";
+      // inplace tensor can't be const
+      const char* FWD_INS_ARG_TEMPLATE;
+      bool flag_find_input_name = false;
+      if (!inplace_map.empty()) {
+        for (auto& inplace_pair : inplace_map) {
+          if (inplace_pair.second == input_name) {
+            flag_find_input_name = true;
+            FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s";
+            break;
+          }
+        }
+      }
+      if (!flag_find_input_name) {
+        FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s";
+      }
       input_args_str_list[input_position] =
           paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name);
 
@@ -1356,6 +1477,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
 
   // [Generation] Get Outs Map
   std::string outs_contents_str = "";
+  std::string inplace_mapping_str = "";
   for (const proto::OpProto::Var& output : out_vars) {
     const std::string& output_name = output.name();
     std::string outnum = "1";
@@ -1398,6 +1520,22 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       }
       core_ops_args_info[op_type].push_back(output_var_name);
 
+    } else if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      // In inplace op, replace the output with the input directly.
+      PADDLE_ENFORCE_NE(
+          inplace_map[output_name], "",
+          paddle::platform::errors::InvalidArgument(
+              "Inplace op %s has no input corresponding to output %s.", op_type,
+              output_name));
+      const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },";
+      auto inplace_input_name = inplace_map[output_name];
+      outs_contents_str += paddle::string::Sprintf(
+          FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name);
+
+      // inplace_map used in TraceOp.
+      const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)";
+      inplace_mapping_str += paddle::string::Sprintf(
+          INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name);
     } else {
       if (output.duplicable()) {
         outnum = output_name + "Num";
@@ -1424,6 +1562,8 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   }
   if (outs_contents_str.size() > 0)
     outs_contents_str.pop_back();  // Remove trailing ","
+  if (inplace_mapping_str.size() > 0)
+    inplace_mapping_str.pop_back();  // Remove trailing ","
 
   const char* FWD_OUTS_MAP_TEMPLATE =
       "  std::map<std::string, "
@@ -1457,6 +1597,12 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   dygraph_function_args_str +=
       ", const paddle::framework::AttributeMap& attr_map";
 
+  /* --------- Generate TraceOp ----- */
+  // TraceOp should be run after compute require_any_grad. (for checking
+  // inplace)
+  // `trace_op_body_str` will be passed as a parameter to
+  // `GenerateGradNodeCreationContent`.
+  std::string trace_op_body_str = "";
   // [Generation] Get TraceOp
   const char* FWD_TRACE_OP_TEMPLATE =
       "  paddle::framework::AttributeMap attrs = attr_map;\n"
@@ -1464,11 +1610,12 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
       "  egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, "
       "outs, attrs, \n"
       "     egr::Controller::Instance().GetExpectedPlace(),\n"
-      "     &default_attrs, true, {});\n";
-  std::string trace_op_str =
-      paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type);
-  generated_function_body += trace_op_str;
-  generated_function_body += "\n";
+      "     &default_attrs, true, {%s});\n";
+  std::string trace_op_str = paddle::string::Sprintf(
+      FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str);
+
+  trace_op_body_str += trace_op_str;
+  trace_op_body_str += "\n";
 
   VLOG(6) << "Generated AttrMap & TraceOp";
 
@@ -1533,34 +1680,64 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
               output_varname, output_var_args_name);
         }
       } else {
-        const char* FWD_OUT_TENSOR_TEMPLATE =
-            "  paddle::experimental::Tensor %s;\n"
-            "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
-        out_tensor_str =
-            paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
-                                    output_name, output_varname);
+        if (!inplace_map.empty() && inplace_map.count(output_name)) {
+          // Modify meta info of inplace tensor.
+          // Bump inplace version of inplace tensor.
+          auto inplace_input_name = inplace_map[output_name];
+          const char* FWD_OUT_TENSOR_TEMPLATE =
+              "  egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n"
+              "  %s.bump_inplace_version();\n"
+              "  VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace "
+              "Strategy.\";\n";
+          out_tensor_str = paddle::string::Sprintf(
+              FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name,
+              inplace_input_name, inplace_input_name);
+        } else {
+          const char* FWD_OUT_TENSOR_TEMPLATE =
+              "  paddle::experimental::Tensor %s;\n"
+              "  egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n";
+          out_tensor_str =
+              paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname,
+                                      output_name, output_varname);
+        }
       }
       return_types[return_position] = "paddle::experimental::Tensor";
     }
 
-    return_contents[return_position] = output_varname;
-    generated_function_body += out_tensor_str;
+    if (!inplace_map.empty() && inplace_map.count(output_name)) {
+      // Replace output directly with input in inplace op.
+      return_contents[return_position] = inplace_map[output_name];
+    } else {
+      return_contents[return_position] = output_varname;
+    }
+    trace_op_body_str += out_tensor_str;
   }
-  generated_function_body += "\n";
+  trace_op_body_str += "\n";
   VLOG(6) << "Converted Output VarBase to EagerVariable(s)";
+  /* ------ END Generate TraceOp ----- */
 
   // [Generation] Handle core_ops_returns_info
-  core_ops_returns_info[op_type] = return_contents;
+  // avoid inplace op changing core_ops_returns_info
+  if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) {
+    core_ops_returns_info[op_type] = return_contents;
+  }
 
   // [Generation] ComputeRequireGrad -> GradNodeCreation
+
   if (!bwd_info.GenerateForwardOnly()) {
-    std::string grad_node_creation_body_str =
-        GenerateGradNodeCreationContent(fwd_info, bwd_info);
+    // If GradNode needs to be generated, pass `trace_op_body_str`
+    // into `GenerateGradNodeCreationContent`.
+    std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(
+        fwd_info, bwd_info, trace_op_body_str, inplace_map);
+
     generated_function_body += grad_node_creation_body_str;
     generated_function_body += "\n";
 
     // [Generation] Call RetainGradForTensor
     VLOG(6) << "Generated GradNode Creation codes";
+  } else {
+    // If GradNode doesn't need to be generated, generate TraceOP directly.
+    generated_function_body += trace_op_body_str;
   }
 
   // [Generation] Handle return: Tuple/Vector/Tensor
@@ -1607,17 +1784,33 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
   VLOG(6) << "Generated return codes";
 
   // [Generation] Get Full Function
-  std::string function_name = op_type + "_dygraph_function";
+  std::string function_name;
+  if (inplace_map.empty()) {
+    function_name = op_type + "_dygraph_function";
+  } else {
+    // change function_name for inplace op.
+    function_name = op_type + "__dygraph_function";
+  }
 
   if (dygraph_function_args_str.size() > 0) {
     auto iter = dygraph_function_args_str.begin();
     if ((*iter) == ',') dygraph_function_args_str.erase(iter);
   }
 
-  const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n";
+  const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE =
+      "  paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", "
+      "paddle::platform::TracerEventType::Operator, 1);";
+  std::string event_name = op_type + " dygraph";
+  std::string fwd_record_event_str = paddle::string::Sprintf(
+      DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name);
+  const char* FWD_FUNCTION_TEMPLATE =
+      "%s %s(%s) {\n\n"
+      "%s\n"
+      "%s\n"
+      "}\n\n";
   std::string fwd_function_str = paddle::string::Sprintf(
       FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name,
-      dygraph_function_args_str, generated_function_body);
+      dygraph_function_args_str, fwd_record_event_str, generated_function_body);
 
   // [Generation] Generate forward functions header
   const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n";
@@ -1804,7 +1997,7 @@ static std::string GenerateSingleOpBase(
             !is_op_base_per_duplicable_input) {
           const char* GRAD_OUTS_CONTENT_TEMPLATE =
               "{ \"%s\", egr::EagerUtils::CreateVars( "
-              "this->OutputMeta()[%d].Size() ) },";
+              "this->OutputMeta()[%d].size() ) },";
           outs_contents_str += paddle::string::Sprintf(
               GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position);
         } else {
@@ -1842,18 +2035,17 @@ static std::string GenerateSingleOpBase(
   const char* ATTRS_TEMPLATE = "  auto& %s = this->attr_map_;\n";
   std::string grad_attrs_str =
       paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name);
-  for (const auto& iter : grad_attrs) {
-    if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue;
-    std::pair<std::string, std::string> type_val =
-        GetAttrType(iter.second, false /*is_arg*/);
-    const char* GRAD_ATTRS_TEMPLATE =
-        "  %s %s = %s;\n"
-        "  %s[\"%s\"] = %s;\n";
-    std::string var_name = iter.first + std::to_string(*outs_size);
-    grad_attrs_str += paddle::string::Sprintf(
-        GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second,
-        attrs_name, iter.first, var_name);
-  }
+  if (fwd_op_type == "cast") {
+    // swtich in out dtype
+    const char* CAST_GRAD =
+        "  auto temp_type = %s[\"in_dtype\"];\n"
+        "  %s[\"in_dtype\"] = %s[\"out_dtype\"];\n"
+        "  %s[\"out_dtype\"] = temp_type;\n";
+    grad_attrs_str += paddle::string::Sprintf(CAST_GRAD, attrs_name, attrs_name,
+                                              attrs_name, attrs_name);
+  }
+  // Handle dynamic grad attributes
+  grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name);
   generated_grad_function_body += grad_attrs_str;
 
   const char* TRACE_OP_TEMPLATE =
@@ -2032,7 +2224,7 @@ static std::string GenerateGradNodeCCContents(
 
   if (is_op_base_per_duplicable_input) {
     const char* OP_BASE_PER_DUP_INPUT_TEMPLATE =
-        "  for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n"
+        "  for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n"
         "    %s\n"
         "  }\n";
     generated_grad_function_body = paddle::string::Sprintf(
@@ -2044,6 +2236,8 @@ static std::string GenerateGradNodeCCContents(
       "GradNode%s::ApplyGradientHooks(grads);\n"
       "  std::vector<std::vector<paddle::experimental::Tensor>> outputs(%d);\n"
       "  %s\n"
+      "  if(NeedComplexToRealConversion()) "
+      "HandleComplexGradToRealGrad(&outputs);\n"
       "  return outputs;\n";
   generated_grad_function_body =
       paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(),
@@ -2052,10 +2246,21 @@ static std::string GenerateGradNodeCCContents(
   // [Generation] Get Full Grad Function
   const char* GRAD_FUNCTION_TEMPLATE =
       "std::vector<std::vector<paddle::experimental::Tensor>> "
-      "GradNode%s::operator()(const "
-      "std::vector<std::vector<paddle::experimental::Tensor>>& grads) {\n%s\n}";
-  std::string grad_function_str = paddle::string::Sprintf(
-      GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
+      "GradNode%s::operator()("
+      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
+      "create_graph) {\n"
+      "%s"
+      "%s"
+      "\n}";
+  std::string fill_zero_str = "";
+  if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) {
+    fill_zero_str =
+        "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, "
+        "this->InputMeta());\n";
+  }
+  std::string grad_function_str =
+      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type,
+                              fill_zero_str, generated_grad_function_body);
 
   VLOG(6) << "Generated returns";
 
@@ -2087,19 +2292,29 @@ static std::string GenerateGradNodeHeaderContents(
       "  ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
       "\n"
       "  virtual std::vector<std::vector<paddle::experimental::Tensor>> "
-      "operator()(const "
-      "std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
+      "operator()("
+      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
+      "create_graph = false) "
       "override;\n"
       "\n"
+      "  void ClearTensorWrappers() override { \n"
+      "%s\n"
+      "    is_tensor_wrappers_cleared = true;\n"
+      "  }\n"
       "  std::string name() override { return \" GradNode%s \"; } \n "
       "\n"
       "  // SetX, SetY, ...\n"
       "%s\n"
       "  // SetAttrMap\n"
       "%s\n"
+      "  bool IsTensorWrappersCleared() override { \n"
+      "    return is_tensor_wrappers_cleared;\n"
+      "  }\n"
       " private:\n"
       "   // TensorWrappers\n"
       "%s\n"
+      "   bool is_tensor_wrappers_cleared = false;\n"
+      "\n"
       "   // Attribute Map\n"
       "%s\n"
       "};";
@@ -2133,6 +2348,7 @@ static std::string GenerateGradNodeHeaderContents(
 
   std::string set_tensor_wrappers_str = "";
   std::string tensor_wrapper_members_str = "";
+  std::string clear_tensor_wrappers_str = "";
   for (const auto& iter : op_base_infos) {
     const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
         iter.GetGradInsFwdSlotnameMap();
@@ -2164,6 +2380,13 @@ static std::string GenerateGradNodeHeaderContents(
             SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
             struct_tensor_wrapper_name);
 
+        const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
+            "for (auto tw: %s)   {\n"
+            "       tw.clear();\n"
+            "     }\n";
+        clear_tensor_wrappers_str += paddle::string::Sprintf(
+            CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
+
       } else {
         const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE =
             "const paddle::experimental::Tensor& %s";
@@ -2176,10 +2399,14 @@ static std::string GenerateGradNodeHeaderContents(
             TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name);
 
         const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
-            "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);";
+            "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n";
         tensor_wrapper_body_str = paddle::string::Sprintf(
             SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
             tensor_wrapper_name, full_reserved_str);
+
+        const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "   %s.clear();\n";
+        clear_tensor_wrappers_str += paddle::string::Sprintf(
+            CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
       }
       std::string full_reserved_signature_str = "bool full_reserved";
       const char* SET_TENSOR_WRAPPER_TEMPLATE =
@@ -2194,8 +2421,8 @@ static std::string GenerateGradNodeHeaderContents(
 
   std::string grad_node_str = paddle::string::Sprintf(
       GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
-      op_type, op_type, set_tensor_wrappers_str, set_attr_map_str,
-      tensor_wrapper_members_str, attr_members_str);
+      op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
+      set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
 
   return grad_node_str;
 }
@@ -2240,8 +2467,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "\"paddle/fluid/eager/api/generated/fluid_generated/"
       "dygraph_forward_api.h\"\n"
       "#include "
-      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"
-      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n";
+      "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
+      "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
+      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
   std::ofstream forward_cc_stream(forward_cc_path, std::ios::out);
@@ -2379,7 +2607,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     /* --------------------------- */
     VLOG(6) << "-------- GenerateForwardFunctionContents -------";
     std::pair<std::string, std::string> body_and_declaration =
-        GenerateForwardFunctionContents(fwd_info, bwd_info);
+        GenerateForwardFunctionContents(fwd_info, bwd_info, {});
 
     fwd_function_str += body_and_declaration.first + "\n";
 
@@ -2387,6 +2615,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     std::string fwd_function_declare_str = body_and_declaration.second;
     dygraph_forward_api_str += fwd_function_declare_str;
 
+    auto& infer_inplace =
+        paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
+    std::map<std::string, std::string> inplace_map;
+    // Inplace Function Generator.
+    // `sum` op has duplicate input. Don't consider adding inplace strategy
+    // for `sum` in temporary.
+    if (op_type != "sum" && infer_inplace) {
+      auto in_to_outs = infer_inplace(true);
+      for (auto& inplace_pair : in_to_outs) {
+        inplace_map[inplace_pair.second] = inplace_pair.first;
+      }
+
+      VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------";
+      std::pair<std::string, std::string> inplace_body_and_declaration =
+          GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map);
+
+      fwd_function_str += inplace_body_and_declaration.first + "\n";
+
+      VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------";
+      std::string inplace_fwd_function_declare_str =
+          inplace_body_and_declaration.second;
+      dygraph_forward_api_str += inplace_fwd_function_declare_str;
+    }
+
     if (bwd_info.GenerateForwardOnly()) continue;
 
     VLOG(6) << "-------- GenerateGradNodeHeaderContents -------";
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index 53af6c1048d24..771351dd4affb 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen
 
 set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h")
 set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h")
+
 add_custom_target(eager_final_state_python_c_codegen
     COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" 
             "--api_yaml_path=${api_yaml_path}"
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 537c2bb7f02be..1d18cbe782948 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -17,6 +17,8 @@
 import argparse
 import os
 
+ops_to_fill_zero_for_empty_grads = set(list("split"))
+
 # For API dispatch used at python-level
 # { op_name : [arg_name, ...] }
 core_ops_returns_info = {}
@@ -28,7 +30,8 @@
 yaml_types_mapping = {
     'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t',  'size_t' : 'size_t', \
     'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
-    'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
+    'str' : 'std::string', \
+    'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
     'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
     'Tensor' : 'Tensor',
     'Tensor[]' : 'std::vector<Tensor>',
@@ -55,6 +58,14 @@ def ParseArguments():
 #################
 ###  Helpers  ###
 #################
+def RecoverBaseNameOfInplaceFunction(function_name):
+    return function_name[:-1]
+
+
+def GetInplacedFunctionName(function_name):
+    return function_name + "_"
+
+
 def FindGradName(string):
     return string + "_grad"
 
@@ -148,6 +159,24 @@ def ReadBwdFile(filepath):
 ######################
 ###  Yaml Parsers  ###
 ######################
+def ParseInplaceInfo(string):
+    # string: "(x -> out0), (y -> out2)"
+    inplace_map = {}
+    for pair in string.split(","):
+        pair = pair.strip()
+        if pair.startswith("("):
+            pair = pair[1:]
+
+        if pair.endswith(")"):
+            pair = pair[:-1]
+
+        key = pair.split("->")[0].strip()
+        val = pair.split("->")[1].strip()
+        inplace_map[key] = val
+
+    return inplace_map
+
+
 def RemoveSpecialSymbolsInName(string):
     # Remove any name after '@'
     ret = string.split("@")[0]
@@ -212,7 +241,8 @@ def ParseYamlArgs(string):
         default_value = m.group(3).split("=")[1].strip() if len(
             m.group(3).split("=")) > 1 else None
 
-        assert arg_type in yaml_types_mapping.keys()
+        assert arg_type in yaml_types_mapping.keys(
+        ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping."
         arg_type = yaml_types_mapping[arg_type]
 
         arg_name = RemoveSpecialSymbolsInName(arg_name)
@@ -247,7 +277,8 @@ def ParseYamlReturns(string):
         else:
             ret_type = ret.strip()
 
-        assert ret_type in yaml_types_mapping.keys()
+        assert ret_type in yaml_types_mapping.keys(
+        ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping."
         ret_type = yaml_types_mapping[ret_type]
 
         assert "Tensor" in ret_type
@@ -439,7 +470,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
                     backward_input_type, False, backward_input_pos
                 ]
             else:
-                assert False
+                assert False, backward_input_name
 
     for backward_output in backward_returns_list:
         backward_output_name = backward_output[0]
@@ -448,7 +479,8 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,
 
         backward_fwd_name = FindForwardName(backward_output_name)
         assert backward_fwd_name is not None
-        assert backward_fwd_name in forward_inputs_position_map.keys()
+        assert backward_fwd_name in forward_inputs_position_map.keys(
+        ), backward_fwd_name
 
         matched_forward_input_type = forward_inputs_position_map[
             backward_fwd_name][0]
@@ -475,6 +507,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
     # SetTensorWrapper Methods & TensorWrapper Members
     set_tensor_wrapper_methods_str = ""
     tensor_wrapper_members_str = ""
+    clear_tensor_wrapper_str = ""
     for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
         if tname in no_need_buffer_set:
             no_need_buffer = "true"
@@ -496,6 +529,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
 """
             tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
                 tensor_wrapper_name)
+
+            CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
+   {}.clear();
+"""
+            clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
+                tensor_wrapper_name)
+
         else:
             assert IsVectorTensorType(ttype)
             SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
@@ -513,6 +553,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
 """
             tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
                 tensor_wrapper_name)
+
+            CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
+   for (auto tw: {}) {
+     tw.clear();
+   };
+"""
+            clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
+                tensor_wrapper_name)
+
     # End: SetTensorWrapper Methods & TensorWrapper Members
 
     # SetAttributes & Attribute Members
@@ -521,7 +570,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
     for aname, atype, default_val, _ in backward_attrs_list:
         saved_attr_name = GetSavedName(aname)
         SET_ATTR_METHOD_TEMPLATE = """
-   void SetAttribute{}({} {}) {{     
+   void SetAttribute{}({} {}) {{
      {} = {};
    }}
 """
@@ -552,25 +601,38 @@ class {} : public egr::GradNodeBase {{
   ~{}() override = default;
 
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
+  
   std::string name() override {{ return \" {} \"; }}
+  
+  void ClearTensorWrappers() override {{
+      {}
+    is_tensor_wrappers_cleared = true;
+  }}
+  
   // SetTensorWrapperX, SetTensorWrapperY, ...
   {}
   // SetAttributes
   {}
+
+  bool IsTensorWrappersCleared() override {{
+      return is_tensor_wrappers_cleared;  
+  }}
  private:
   // TensorWrappers
   {}
 
+  bool is_tensor_wrappers_cleared = false;
+
   // Attributes
   {}
 }};
 """
     node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
         grad_node_name, grad_node_name, grad_node_name, grad_node_name,
-        grad_node_name, set_tensor_wrapper_methods_str,
-        set_attribute_methods_str, tensor_wrapper_members_str,
-        attribute_members_str)
+        grad_node_name, clear_tensor_wrapper_str,
+        set_tensor_wrapper_methods_str, set_attribute_methods_str,
+        tensor_wrapper_members_str, attribute_members_str)
 
     return node_declaration_str
 
@@ -598,10 +660,11 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     for _, (ttype, fwd_position,
             grad_api_position) in backward_grad_input_map.items():
         if IsPlainTensorType(ttype):
-            grad_api_args[grad_api_position] = f"grads[{fwd_position}][0]"
+            grad_api_args[
+                grad_api_position] = f"hooked_grads[{fwd_position}][0]"
         else:
             assert IsVectorTensorType(ttype)
-            grad_api_args[grad_api_position] = f"grads[{fwd_position}]"
+            grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]"
 
     for name, _, _, grad_api_position in backward_attrs_list:
         saved_attribute_name = GetSavedName(name)
@@ -624,35 +687,45 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
         else:
             # Rearrange output order accordingly
             returns_str += f"returns[{fwd_position}] =  grad_api_returns[{grad_api_position}];\n"
+    returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
     returns_str += f"return returns;\n"
 
     grad_node_name = GetGradNodeName(fwd_api_name)
 
+    fill_zero_str = ""
+    if fwd_api_name in ops_to_fill_zero_for_empty_grads:
+        fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
+
     if len(namespace) > 0:
         grad_api_namespace = f"paddle::experimental::{namespace}"
     else:
         grad_api_namespace = f"paddle::experimental"
 
     FUNCTION_TEMPLATE = """
-std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
+std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
+    {}
+    auto hooked_grads = ApplyGradientHooks(grads);
+    
     // Call grad_api function
+    VLOG(3) << \"Final State Running: \" << \"{}\"; 
     auto grad_api_returns = {}::{}({});
     {}
 }}
   """
 
     node_definition_str = FUNCTION_TEMPLATE.format(
-        grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str,
-        returns_str)
+        grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
+        bwd_api_name, grad_api_args_str, returns_str)
 
     return node_definition_str
 
 
 def GenerateNodeCreationCodes(
         fwd_api_name, bwd_api_name, forward_inputs_position_map,
-        forward_outputs_position_map, forward_attrs_list,
+        forward_outputs_position_map, forward_attrs_list, forward_call_str,
         backward_fwd_input_map, backward_grad_input_map,
-        backward_grad_output_map, backward_attrs_list, optional_inputs):
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        inplace_map):
     # fwd_api_name = ""
     # forward_inputs_position_map = { "name" : [type, fwd_position] }
     # forward_outputs_position_map = { "name" : [type, fwd_position] }
@@ -689,19 +762,19 @@ def GenerateNodeCreationCodes(
         output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
         if num_fwd_outputs == 1:
             if IsPlainTensorType(rtype):
-                output_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
+                output_autograd_meta = f"        egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
             else:
                 assert IsVectorTensorType(rtype)
-                output_autograd_meta = f"    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
-                output_autograd_meta += f"    std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+                output_autograd_meta = f"        std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
+                output_autograd_meta += f"        std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
         else:
             # Tuple api_result
             if IsPlainTensorType(rtype):
-                output_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
+                output_autograd_meta = f"        egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
             else:
                 assert IsVectorTensorType(rtype)
-                output_autograd_meta = f"    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n"
-                output_autograd_meta += f"    std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+                output_autograd_meta = f"        std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
+                output_autograd_meta += f"        std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
 
         outputs_autograd_meta_list.append(output_autograd_meta)
         pass_stop_gradient_args_list.append(output_autograd_meta_name)
@@ -710,16 +783,41 @@ def GenerateNodeCreationCodes(
     outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
     pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
 
+    # Check Inplace
+    check_inplace_str = ""
+    bump_inplace_version_str = ""
+    for inplace_name in inplace_map.keys():
+        inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
+        check_inplace_str += f"""
+    // Check Inplace
+    egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
+"""
+
+        bump_inplace_version_str += f"""
+    // Bump Inplace Version
+    {inplace_name}.bump_inplace_version();
+    VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n
+"""
+
     # Node Construction
     num_bwd_inputs = len(backward_grad_input_map.keys())
     num_bwd_outputs = len(backward_grad_output_map.keys())
-    grad_node_name = GetGradNodeName(fwd_api_name)
-    node_construction_str = f"        auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
+    grad_node_name = GetGradNodeName(
+        RecoverBaseNameOfInplaceFunction(
+            fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name)
+    node_construction_str = f"            auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
 
     # SetAttributes
     set_attributes_list = []
-    for name, _, _, _ in backward_attrs_list:
-        set_attributes = f"        grad_node->SetAttribute{name}({name});"
+    forward_attrs_name_set = set()
+    for name, _, _, _ in forward_attrs_list:
+        forward_attrs_name_set.add(name)
+
+    for name, _, default_val_attr, _ in backward_attrs_list:
+        if name in forward_attrs_name_set:
+            set_attributes = f"        grad_node->SetAttribute{name}({name});"
+        else:
+            set_attributes = f"        grad_node->SetAttribute{name}({default_val_attr});"
         set_attributes_list.append(set_attributes)
     set_attributes_str = "\n".join(set_attributes_list)
 
@@ -730,19 +828,22 @@ def GenerateNodeCreationCodes(
 
         if is_fwd_input:
             if is_optional:
-                set_tensor_wrappers = f"        if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
+                set_tensor_wrappers = f"            if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
             else:
-                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({name}, true);"
+                set_tensor_wrappers = f"            grad_node->SetTensorWrapper{name}({name}, true);"
         else:
-            if IsVectorTensorType(atype):
-                tw_name = f"api_result[{pos}]"
+            if num_fwd_outputs > 1:
+                # Aligned with forward output position
+                assert name in forward_outputs_position_map.keys()
+                fwd_output_pos = forward_outputs_position_map[name][1]
+                tw_name = f"std::get<{fwd_output_pos}>(api_result)"
             else:
                 tw_name = f"api_result"
 
             if is_optional:
-                set_tensor_wrappers = f"        if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
+                set_tensor_wrappers = f"            if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
             else:
-                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({tw_name}, false);"
+                set_tensor_wrappers = f"            grad_node->SetTensorWrapper{name}({tw_name}, false);"
         set_tensor_wrappers_list.append(set_tensor_wrappers)
     set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
 
@@ -751,8 +852,8 @@ def GenerateNodeCreationCodes(
     set_edges_list = []
     for name, (_, pos) in forward_inputs_position_map.items():
         input_autograd_meta_name = GetAutoGradMetaName(name)
-        set_grad_out_meta = f"        grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});"
-        set_edges = f"        grad_node->AddEdges({input_autograd_meta_name}, {pos});"
+        set_grad_out_meta = f"            grad_node->SetGradOutMeta({name}, {pos});"
+        set_edges = f"            grad_node->AddEdges({input_autograd_meta_name}, {pos});"
         set_grad_out_meta_list.append(set_grad_out_meta)
         set_edges_list.append(set_edges)
     set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
@@ -766,73 +867,83 @@ def GenerateNodeCreationCodes(
     num_outputs = len(forward_outputs_position_map.keys())
     for name, (_, pos) in forward_outputs_position_map.items():
         output_autograd_meta_name = GetAutoGradMetaName(name)
-        set_out_rank = f"        egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
-        set_history = f"        egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
-        set_grad_in_meta = f"        grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});"
+        set_out_rank = f"            egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
+        set_history = f"            egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
+        if num_outputs == 1:
+            set_retain_grad = f"            egr::EagerUtils::CheckAndRetainGrad(api_result);"
+            set_grad_in_meta = f"            grad_node->SetGradInMeta(api_result, {pos});"
+        else:
+            set_retain_grad = f"            egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
+            set_grad_in_meta = f"            grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
 
         set_out_rank_list.append(set_out_rank)
         set_history_list.append(set_history)
         set_grad_in_meta_list.append(set_grad_in_meta)
-
-        if num_outputs == 1:
-            set_retain_grad = f"        egr::EagerUtils::CheckAndRetainGrad(api_result);"
-        else:
-            set_retain_grad = f"        egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);"
         set_retain_grad_list.append(set_retain_grad)
+
     set_out_rank_str = "\n".join(set_out_rank_list)
     set_history_str = "\n".join(set_history_list)
     set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
     set_retain_grad_str = "\n".join(set_retain_grad_list)
 
+    node_event_name = fwd_api_name + " node_creation"
+    NODE_CREATION_TEMPLATE = """
+        paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
+        """
+    node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
+
     NODE_CREATION_TEMPLATE = """
 
     // Get AutoGradMeta
-{}
 {}
     bool trace_backward = egr::Controller::Instance().HasGrad();
-
     bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
-    if(require_any_grad) {{
-        egr::EagerUtils::PassStopGradient({});
-        
-        // Node Construction
 {}
-
-        // SetAttributes
+    // Forward API Call
+    {}
 {}
-
-        // SetTensorWrappers
+    {{
 {}
-
-        // SetGradOutMeta & SetEdges
 {}
+        if(require_any_grad) {{
+            egr::EagerUtils::PassStopGradient({});
+            
+            // Node Construction
 {}
-
-        // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+            // SetAttributes
 {}
+            // SetTensorWrappers
 {}
+            // SetGradOutMeta & SetEdges
 {}
 {}
-
+            // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+{}
+{}
+{}
+{}
+        }}
     }}
 
 """
     node_creation_str = NODE_CREATION_TEMPLATE.format(
-        inputs_autograd_meta_str, outputs_autograd_meta_str,
-        compute_require_grad_args_str, pass_stop_gradient_args_str,
-        node_construction_str, set_attributes_str, set_tensor_wrappers_str,
-        set_grad_out_meta_str, set_edges_str, set_out_rank_str, set_history_str,
-        set_grad_in_meta_str, set_retain_grad_str)
+        inputs_autograd_meta_str, compute_require_grad_args_str,
+        check_inplace_str, forward_call_str, bump_inplace_version_str,
+        node_creation_event_str, outputs_autograd_meta_str,
+        pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
+        set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
+        set_out_rank_str, set_history_str, set_grad_in_meta_str,
+        set_retain_grad_str)
 
     return node_creation_str
 
 
-def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
-                              forward_inputs_position_map,
-                              forward_outputs_position_map, forward_attrs_list,
-                              backward_fwd_input_map, backward_grad_input_map,
-                              backward_grad_output_map, backward_attrs_list,
-                              optional_inputs, intermediate_outputs):
+def GenerateForwardDefinition(
+        fwd_api_name, bwd_api_name, forward_inputs_position_map,
+        forward_outputs_position_map, forward_attrs_list,
+        backward_fwd_input_map, backward_grad_input_map,
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        intermediate_outputs, inplace_map):
     # fwd_api_name = ""
     # forward_inputs_position_map = { "name" : [type, fwd_position] }
     # forward_outputs_position_map = { "name" : [type, fwd_position] }
@@ -856,7 +967,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
             if is_optional:
                 arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
             else:
-                arg_str = f"const paddle::experimental::Tensor& {name}"
+                if inplace_map and name in inplace_map.keys():
+                    arg_str = f"paddle::experimental::Tensor& {name}"
+                else:
+                    arg_str = f"const paddle::experimental::Tensor& {name}"
         else:
             assert IsVectorTensorType(ttype)
             arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
@@ -900,7 +1014,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
             returns_list[0] = f"api_result"
         else:
             # Tuple api_result
-            returns_list[pos] = f"api_result[{pos}]"
+            returns_list[pos] = f"std::get<{pos}>(api_result)"
 
         if IsPlainTensorType(rtype):
             returns_type_list[pos] = "paddle::experimental::Tensor"
@@ -919,13 +1033,15 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
 
     node_creation_str = GenerateNodeCreationCodes(
         fwd_api_name, bwd_api_name, forward_inputs_position_map,
-        forward_outputs_position_map, forward_attrs_list,
+        forward_outputs_position_map, forward_attrs_list, forward_call_str,
         backward_fwd_input_map, backward_grad_input_map,
-        backward_grad_output_map, backward_attrs_list, optional_inputs)
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        inplace_map)
+
+    dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
 
     FORWARD_FUNCTION_TEMPLATE = """
 {} {}({}) {{
-    // Forward API Call
     {}
     
 {}
@@ -938,7 +1054,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
     forward_function_name = GetForwardFunctionName(fwd_api_name)
     forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
         returns_type_str, forward_function_name, inputs_args_definition_str,
-        forward_call_str, node_creation_str, returns_str)
+        dygraph_event_str, node_creation_str, returns_str)
     forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
 
     return forward_function_str, forward_function_declaration_str
@@ -1038,7 +1154,7 @@ def GenerateNodeCCFile(filepath, node_definition_str):
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
 #include "paddle/fluid/eager/to_static/run_program_op_node.h"
 
-#include "paddle/phi/api/include/sparse_api.h"
+#include "paddle/phi/api/backward/sparse_bw_api.h"
 """
     file_contents += node_definition_str
     with open(filepath, 'a') as f:
@@ -1065,6 +1181,8 @@ def GenerateForwardCCFile(filepath, forward_definition_str):
 
 #include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+
 """
 
     file_contents += GenerateCoreOpInfoDefinition()
@@ -1138,8 +1256,12 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
             fwd_args_str = fwd_api['args']
             fwd_returns_str = fwd_api['output']
 
+            inplace_map = {}
+            if 'inplace' in fwd_api.keys():
+                inplace_map = ParseInplaceInfo(fwd_api['inplace'])
+
             bwd_api_name = fwd_api['backward']
-            assert bwd_api_name in grad_api_dict.keys()
+            assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
             bwd_api = grad_api_dict[bwd_api_name]
 
             assert 'args' in bwd_api.keys()
@@ -1211,7 +1333,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
             print("Generated Backward Grad Output Map: ",
                   backward_grad_output_map)
 
-            # Backward Validation Check
+            # Backward Validation Check            
             BackwardValidationCheck(backward_fwd_input_map,
                                     backward_grad_input_map,
                                     backward_attrs_list)
@@ -1231,10 +1353,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
             # Node Definition Generation
             definition_declaration_pair = GenerateForwardDefinition(
                 fwd_api_name, bwd_api_name, forward_inputs_position_map,
-                forward_outputs_position_map, forward_attrs_list,
+                forward_outputs_position_map, orig_forward_attrs_list,
                 backward_fwd_input_map, backward_grad_input_map,
                 backward_grad_output_map, backward_attrs_list, optional_inputs,
-                intermediate_outputs)
+                intermediate_outputs, {})
             print("Generated Forward Definition: ", forward_definition_str)
             print("Generated Forward Declaration: ", forward_declaration_str)
             yaml_forward_definition_str += definition_declaration_pair[0]
@@ -1243,7 +1365,31 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
             # For python-level API dispatch
             CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
                                       forward_outputs_position_map,
-                                      forward_attrs_list)
+                                      orig_forward_attrs_list)
+
+            # Inplaced Version Dygraph Function Generation
+            if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
+                fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
+
+                # Node Definition Generation
+                definition_declaration_pair = GenerateForwardDefinition(
+                    fwd_api_name_inplaced, bwd_api_name,
+                    forward_inputs_position_map, forward_outputs_position_map,
+                    forward_attrs_list, backward_fwd_input_map,
+                    backward_grad_input_map, backward_grad_output_map,
+                    backward_attrs_list, optional_inputs, intermediate_outputs,
+                    inplace_map)
+                print("Generated Inplaced Forward Definition: ",
+                      forward_definition_str)
+                print("Generated Inplaced Forward Declaration: ",
+                      forward_declaration_str)
+                forward_definition_str += definition_declaration_pair[0]
+                forward_declaration_str += definition_declaration_pair[1]
+
+                # For python-level API dispatch
+                CollectCoreOpsInformation(
+                    fwd_api_name_inplaced, forward_inputs_position_map,
+                    forward_outputs_position_map, forward_attrs_list)
 
         if len(namespace) > 0:
             forward_definition_str += f"""namespace {namespace} {{
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index eee32a2c5057d..5a732212a5649 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -14,9 +14,18 @@
 
 import os
 import argparse
-from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
+import logging
+from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
+
+###########################
+## Global Configurations ##
+###########################
+skipped_forward_api_names = set(["scale"])
+
+
+def SkipAPIGeneration(forward_api_name):
+    return (forward_api_name in skipped_forward_api_names)
 
-skipped_fwd_api_names = set(["scale"])
 
 atype_to_parsing_function = {
     "bool": "CastPyArg2Boolean",
@@ -24,7 +33,7 @@
     "long": "CastPyArg2Long",
     "int64_t": "CastPyArg2Long",
     "float": "CastPyArg2Float",
-    "string": "CastPyArg2String",
+    "std::string": "CastPyArg2String",
     "std::vector<bool>": "CastPyArg2Booleans",
     "std::vector<int>": "CastPyArg2Ints",
     "std::vector<long>": "CastPyArg2Longs",
@@ -34,69 +43,48 @@
     "std::vector<std::string>": "CastPyArg2Strings",
     "paddle::experimental::Scalar": "CastPyArg2Scalar",
     "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray",
-    "paddle::experimental::Backend": "CastPyArg2Backend",
+    "paddle::experimental::Place": "CastPyArg2Place",
     "paddle::experimental::DataType": "CastPyArg2DataType",
 }
 
 
-def ParseArguments():
-    parser = argparse.ArgumentParser(
-        description='Eager Code Generator Args Parser')
-    parser.add_argument('--api_yaml_path', type=str)
-    parser.add_argument('--output_path', type=str)
-
-    args = parser.parse_args()
-    return args
-
-
 def FindParsingFunctionFromAttributeType(atype):
     if atype not in atype_to_parsing_function.keys():
-        print(f"Unable to find {atype} in atype_to_parsing_function.")
-        assert False
+        assert False, f"Unable to find {atype} in atype_to_parsing_function."
 
     return atype_to_parsing_function[atype]
 
 
-def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
-                            forward_attrs_list, forward_outputs_position_map,
-                            optional_inputs, is_forward_only):
-    # forward_inputs_position_map = { "name" : [type, fwd_position] }
-    # forward_outputs_position_map = { "name" : [type, fwd_position] }
-    # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...]
-    # optional_inputs = [name0, ...]
-
-    # Get EagerTensor from args
-    # Get dygraph function call args
-    num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list)
-    num_input_tensors = len(forward_inputs_position_map.keys())
-    dygraph_function_call_list = ["" for i in range(num_args)]
-    get_eager_tensor_str = ""
-    for name, (ttype, pos) in forward_inputs_position_map.items():
-        is_optional = (name in optional_inputs)
-        if IsVectorTensorType(ttype):
-            get_eager_tensor_str += f"    auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
-        else:
-            if is_optional:
-                get_eager_tensor_str += f"    auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
-            else:
-                get_eager_tensor_str += f"    auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n"
-        dygraph_function_call_list[pos] = f"{name}"
+##########################
+## Refactored Functions ##
+##########################
+PARSE_PYTHON_C_TENSORS_TEMPLATE = \
+"    auto {} = {}(\"{}\", \"{}\", args, {}, {});\n"
+
+
+PARSE_PYTHON_C_ARGS_TEMPLATE = \
+"""    PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n
+     {} {} = {}({}_obj, \"{}\", {});\n"""
+
 
-    parse_attributes_str = ""
-    # Get Attributes
-    for name, atype, _, pos in forward_attrs_list:
-        parsing_function = FindParsingFunctionFromAttributeType(atype)
-        key = f"{name}"
+RECORD_EVENT_TEMPLATE = \
+"    paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
 
-        parse_attributes_str += f"    PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n"
-        parse_attributes_str += f"    {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n"
 
-        dygraph_function_call_list[pos] = f"{name}"
-    dygraph_function_call_str = ",".join(dygraph_function_call_list)
+RETURN_INPLACE_PYOBJECT_TEMPLATE = \
+"""
+    ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info, \"final_state_{}\", \"{}\");
+    ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info, \"final_state_{}\", \"{}\");
+    return ToPyObject(out, return_id, args, arg_id);
+"""
+
 
-    PYTHON_C_FUNCTION_TEMPLATE = """
+PYTHON_C_FUNCTION_TEMPLATE = \
+"""
 static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
 {{
+  {}
+
   PyThreadState *tstate = nullptr;
   try
   {{
@@ -114,7 +102,7 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
     
     PyEval_RestoreThread(tstate);
     tstate = nullptr;
-    return ToPyObject(out);
+{}
   }}
   catch(...) {{
     if (tstate) {{
@@ -126,26 +114,50 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map,
 }}
 
 """
-    namespace_str = ""
-    if len(namespace) > 0:
-        namespace_str = f"{namespace}::"
 
-    if is_forward_only:
-        fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name
-    else:
-        fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name)
 
-    python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
-        fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str,
-        fwd_function_name, dygraph_function_call_str)
+FUNCTION_NAME_TEMPLATE = \
+"{}{}{}"
 
-    python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n"
 
-    return python_c_function_str, python_c_function_reg_str
+PYTHON_C_FUNCTION_REG_TEMPLATE = \
+"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}"
 
 
-def GenerateCoreOpsInfoMap():
-    result = """
+PYTHON_C_WRAPPER_TEMPLATE = \
+"""
+#pragma once
+
+#include  "pybind11/detail/common.h"
+#include  "paddle/phi/api/all.h"
+#include  "paddle/phi/api/lib/dygraph_api.h"
+#include  "paddle/phi/common/backend.h"
+#include  "paddle/phi/common/data_type.h"
+#include  "paddle/phi/common/scalar.h"
+#include  "paddle/phi/common/scalar_array.h"
+#include  "paddle/phi/api/include/sparse_api.h"
+#include  "paddle/fluid/pybind/op_function_common.h"
+#include  "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
+#include  "paddle/fluid/pybind/exception.h"
+#include  "paddle/fluid/platform/profiler/event_tracing.h"
+#include  <Python.h>
+
+namespace paddle {{
+namespace pybind {{
+
+{}
+
+static PyMethodDef EagerFinalStateMethods[] = {{
+    {}
+}};
+
+}} // namespace pybind
+}} // namespace paddle
+"""
+
+
+CORE_OPS_INFO = \
+"""
 static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) {
     PyThreadState *tstate = nullptr;
     try
@@ -190,9 +202,11 @@ def GenerateCoreOpsInfoMap():
       return nullptr;
     }
 }
-    """
+"""
+
 
-    core_ops_infos_registry = """
+CORE_OPS_INFO_REGISTRY = \
+"""
     {\"get_final_state_core_ops_args_info\",
     (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS,
     \"C++ interface function for eager_get_final_state_core_ops_args_info.\"},
@@ -205,7 +219,285 @@ def GenerateCoreOpsInfoMap():
     METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"},
 """
 
-    return result, core_ops_infos_registry
+NAMESPACE_WRAPPER_TEMPLATE = \
+"""namespace {} {{
+    {}
+}}
+"""
+
+
+#######################
+## Generator Classes ##
+#######################
+class PythonCSingleFunctionGenerator:
+    def __init__(self, fwd_api_contents, namespace):
+        self.fwd_api_contents = fwd_api_contents
+        self.namespace = namespace
+
+        # Raw Contents
+        self.forward_api_name = ""
+        self.forward_args_str = ""
+        self.forward_returns_str = ""
+
+        # Raw Data
+        self.forward_attrs_list = None  #[ [attr_name, attr_type, default_value, orig_position], ...]
+        self.forward_inputs_list = None  #[ [arg_name, arg_type, orig_position], ...]
+        self.forward_returns_list = None  #[ [ret_name, ret_type, orig_position], ...]
+
+        # Processed Data
+        self.forward_inputs_position_map = None  #{ "name" : [type, fwd_position] }
+        self.forward_outputs_position_map = None  #{ "name" : [type, fwd_position] }
+
+        # Special Op Attributes
+        self.optional_inputs = []  #[name, ...]
+        self.is_forward_only = True
+
+        # Generated Results
+        self.python_c_function_str = ""
+        self.python_c_function_reg_str = ""
+
+    def CollectRawContents(self):
+        fwd_api_contents = self.fwd_api_contents
+
+        assert 'api' in fwd_api_contents.keys(
+        ), "Unable to find \"api\" in fwd_api_contents keys"
+        assert 'args' in fwd_api_contents.keys(
+        ), "Unable to find \"args\" in fwd_api_contents keys"
+        assert 'output' in fwd_api_contents.keys(
+        ), "Unable to find \"output\" in fwd_api_contents keys"
+
+        self.forward_api_name = fwd_api_contents['api']
+        self.forward_args_str = fwd_api_contents['args']
+        self.forward_returns_str = fwd_api_contents['output']
+
+    def CollectIsForwardOnly(self):
+        fwd_api_contents = self.fwd_api_contents
+        self.is_forward_only = False if 'backward' in fwd_api_contents.keys(
+        ) else True
+
+    def CollectOptionalInputs(self):
+        fwd_api_contents = self.fwd_api_contents
+        if 'optional' in fwd_api_contents.keys():
+            self.optional_inputs = ParseDispensable(fwd_api_contents[
+                'optional'])
+
+    def CollectForwardInOutAttr(self):
+        forward_args_str = self.forward_args_str
+        forward_returns_str = self.forward_returns_str
+
+        self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward(
+            forward_args_str, forward_returns_str)
+
+    def CollectForwardPositionMap(self):
+        forward_inputs_list = self.forward_inputs_list
+        forward_returns_list = self.forward_returns_list
+
+        self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
+            forward_inputs_list, forward_returns_list)
+
+    def GeneratePythonCFunction(self, inplace_map):
+        namespace = self.namespace
+        forward_api_name = GetInplacedFunctionName(
+            self.forward_api_name) if inplace_map else self.forward_api_name
+        forward_attrs_list = self.forward_attrs_list
+        forward_inputs_position_map = self.forward_inputs_position_map
+        forward_outputs_position_map = self.forward_outputs_position_map
+        optional_inputs = self.optional_inputs
+        is_forward_only = self.is_forward_only
+
+        # Generate Python-C Tensors Parsing Logic
+        get_eager_tensor_str = ""
+        for name, (ttype, pos) in forward_inputs_position_map.items():
+            is_optional = (name in optional_inputs)
+            if IsVectorTensorType(ttype):
+                get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format(
+                    name, "GetTensorListFromArgs", forward_api_name, name, pos,
+                    "false")
+            else:
+                if is_optional:
+                    get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format(
+                        name, "GetOptionalTensorFromArgs", forward_api_name,
+                        name, pos, "true")
+                else:
+                    get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format(
+                        name, "GetTensorFromArgs", forward_api_name, name, pos,
+                        "false")
+
+        parse_attributes_str = ""
+
+        # Generate Python-C Attributes Parsing Logic
+        for name, atype, _, pos in forward_attrs_list:
+            parsing_function_name = FindParsingFunctionFromAttributeType(atype)
+            parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
+                name, pos, atype, name, parsing_function_name, name,
+                forward_api_name, pos)
+
+        # Generate Dygraph Function Call Logic
+        num_args = len(forward_inputs_position_map.keys()) + len(
+            forward_attrs_list)
+        dygraph_function_call_list = ["" for i in range(num_args)]
+        for name, (_, pos) in forward_inputs_position_map.items():
+            dygraph_function_call_list[pos] = f"{name}"
+        for name, _, _, pos in forward_attrs_list:
+            dygraph_function_call_list[pos] = f"{name}"
+        dygraph_function_call_str = ",".join(dygraph_function_call_list)
+
+        # Generate Python-C Function Definitions
+        if is_forward_only:
+            fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
+                "paddle::experimental::", namespace, forward_api_name)
+        else:
+            fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
+                "::", namespace, GetForwardFunctionName(forward_api_name))
+
+        if inplace_map:
+            assert len(
+                inplace_map
+            ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
+            for inplace_input, inplace_output in inplace_map.items():
+                return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
+                    forward_api_name, inplace_input, forward_api_name,
+                    inplace_output)
+                break
+        else:
+            return_str = "    return ToPyObject(out);"
+
+        # Generate Record Event for performance profiling
+        pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
+            "pythonc_record_event", forward_api_name, "pybind_imperative_func")
+        self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
+            forward_api_name, pythonc_record_event_str, forward_api_name,
+            get_eager_tensor_str, parse_attributes_str, fwd_function_name,
+            dygraph_function_call_str, return_str)
+
+        # Generate Python-C Function Registration
+        self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
+            forward_api_name, namespace, forward_api_name, forward_api_name)
+
+    def run(self, inplace_map):
+        # Initialized is_forward_only
+        self.CollectIsForwardOnly()
+
+        # Initialized forward_api_name, forward_args_str, forward_returns_str
+        self.CollectRawContents()
+        if SkipAPIGeneration(self.forward_api_name): return False
+
+        # Initialized optional_inputs
+        self.CollectOptionalInputs()
+
+        # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list
+        self.CollectForwardInOutAttr()
+        logging.info(
+            f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}")
+        logging.info(
+            f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}")
+        logging.info(
+            f"Parsed Original Forward Returns List: \n{self.forward_returns_list}"
+        )
+
+        # Initialized forward_inputs_position_map, forward_outputs_position_map
+        self.CollectForwardPositionMap()
+        logging.info(
+            f"Generated Forward Input Position Map: {self.forward_inputs_position_map}"
+        )
+        logging.info(
+            f"Generated Forward Output Position Map: {self.forward_outputs_position_map}"
+        )
+
+        # Code Generation
+        self.GeneratePythonCFunction(inplace_map)
+        logging.info(
+            f"Generated Python-C Function: {self.python_c_function_str}")
+        logging.info(
+            f"Generated Python-C Function Declaration: {self.python_c_function_reg_str}"
+        )
+
+        return True
+
+
+class PythonCYamlGenerator:
+    def __init__(self, path):
+        self.yaml_path = path
+
+        self.namespace = ""
+        self.forward_api_list = []
+
+        # Generated Result
+        self.python_c_functions_reg_str = ""
+        self.python_c_functions_str = ""
+
+    def ParseYamlContents(self):
+        yaml_path = self.yaml_path
+        self.forward_api_list = ReadFwdFile(yaml_path)
+
+    def GeneratePythonCFunctions(self):
+        namespace = self.namespace
+        forward_api_list = self.forward_api_list
+
+        for forward_api_content in forward_api_list:
+            f_generator = PythonCSingleFunctionGenerator(forward_api_content,
+                                                         namespace)
+            status = f_generator.run({})
+
+            if status == True:
+                self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
+                self.python_c_functions_str += f_generator.python_c_function_str + "\n"
+
+            if 'inplace' in forward_api_content.keys():
+                inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
+
+                f_generator_inplace = PythonCSingleFunctionGenerator(
+                    forward_api_content, namespace)
+                status = f_generator_inplace.run(inplace_map)
+
+                if status == True:
+                    self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
+                    self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
+
+    def InferNameSpace(self):
+        yaml_path = self.yaml_path
+        if "sparse" in yaml_path:
+            self.namespace = "sparse::"
+
+    def AttachNamespace(self):
+        namespace = self.namespace
+        python_c_functions_str = self.python_c_functions_str
+
+        if namespace != "":
+            if namespace.endswith("::"):
+                namespace = namespace[:-2]
+            self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format(
+                namespace, python_c_functions_str)
+
+    def run(self):
+        # Infer namespace from yaml_path
+        self.InferNameSpace()
+
+        # Read Yaml file
+        self.ParseYamlContents()
+
+        # Code Generation
+        self.GeneratePythonCFunctions()
+
+        # Wrap with namespace
+        self.AttachNamespace()
+
+
+############################
+## Code Generation Helper ##
+############################
+def ParseArguments():
+    parser = argparse.ArgumentParser(
+        description='Eager Code Generator Args Parser')
+    parser.add_argument('--api_yaml_path', type=str)
+    parser.add_argument('--output_path', type=str)
+
+    args = parser.parse_args()
+    return args
+
+
+def GenerateCoreOpsInfoMap():
+    return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY
 
 
 def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
@@ -217,35 +509,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str):
     python_c_function_reg_str += core_ops_infos_registry
     python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}"
 
-    PYTHON_C_WRAPPER_TEMPLATE = """
-#pragma once
-
-#include  "pybind11/detail/common.h"
-#include  "paddle/phi/api/all.h"
-#include  "paddle/phi/api/lib/dygraph_api.h"
-#include  "paddle/phi/common/backend.h"
-#include  "paddle/phi/common/data_type.h"
-#include  "paddle/phi/common/scalar.h"
-#include  "paddle/phi/common/scalar_array.h"
-#include  "paddle/phi/api/include/sparse_api.h"
-#include  "paddle/fluid/pybind/op_function_common.h"
-#include  "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
-#include  "paddle/fluid/pybind/exception.h"
-#include  <Python.h>
-
-namespace paddle {{
-namespace pybind {{
-
-{}
-
-static PyMethodDef EagerFinalStateMethods[] = {{
-    {}
-}};
-
-}} // namespace pybind
-}} // namespace paddle
-
-"""
     python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str,
                                                     python_c_function_reg_str)
 
@@ -259,86 +522,23 @@ def GeneratePythonCFile(filepath, python_c_str):
 
 if __name__ == "__main__":
     args = ParseArguments()
-
     api_yaml_paths = args.api_yaml_path.split(",")
 
-    python_c_functions_reg_str = ""
-    python_c_functions_str = ""
-
+    generated_python_c_functions = ""
+    generated_python_c_registration = ""
     for i in range(len(api_yaml_paths)):
         api_yaml_path = api_yaml_paths[i]
 
-        if "sparse" in api_yaml_path:
-            namespace = "sparse"
-        else:
-            namespace = ""
-
-        fwd_api_list = ReadFwdFile(api_yaml_path)
-
-        python_c_function_list = []
-        python_c_function_reg_list = []
-        for fwd_api in fwd_api_list:
-
-            # We only generate Ops with grad
-            is_forward_only = False
-            if 'backward' not in fwd_api.keys():
-                is_forward_only = True
-
-            assert 'api' in fwd_api.keys()
-            assert 'args' in fwd_api.keys()
-            assert 'output' in fwd_api.keys()
-
-            fwd_api_name = fwd_api['api']
-            fwd_args_str = fwd_api['args']
-            fwd_returns_str = fwd_api['output']
-
-            if fwd_api_name in skipped_fwd_api_names:
-                continue
-
-            # Parse Dispensable Inputs
-            optional_inputs = []
-            if 'optional' in fwd_api.keys():
-                optional_inputs = ParseDispensable(fwd_api['optional'])
-
-            # Collect Original Forward Inputs/Outputs and then perform validation checks
-            forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward(
-                fwd_args_str, fwd_returns_str)
-            print("Parsed Original Forward Inputs List: ", forward_inputs_list)
-            print("Prased Original Forward Attrs List: ", forward_attrs_list)
-            print("Parsed Original Forward Returns List: ",
-                  forward_returns_list)
-
-            forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
-                forward_inputs_list, forward_returns_list)
-            print("Generated Forward Input Position Map: ",
-                  forward_inputs_position_map)
-            print("Generated Forward Output Position Map: ",
-                  forward_outputs_position_map)
-
-            python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction(
-                fwd_api_name, forward_inputs_position_map, forward_attrs_list,
-                forward_outputs_position_map, optional_inputs, is_forward_only)
-            python_c_function_list.append(python_c_function_str)
-            python_c_function_reg_list.append(python_c_function_reg_str)
-            print("Generated Python-C Function: ", python_c_function_str)
-
-        # Append Namespace
-        python_c_functions_reg_str += ",\n".join(
-            python_c_function_reg_list) + ","
-        python_c_functions = "\n".join(python_c_function_list)
-        if len(namespace) > 0:
-            python_c_functions_str += f"""namespace {namespace} {{
-    {python_c_functions}
-}}
-"""
+        y_generator = PythonCYamlGenerator(api_yaml_path)
+        y_generator.run()
 
-        else:
-            python_c_functions_str += python_c_functions
+        generated_python_c_functions += y_generator.python_c_functions_str + "\n"
+        generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n"
 
-    python_c_str = GeneratePythonCWrappers(python_c_functions_str,
-                                           python_c_functions_reg_str)
+    python_c_str = GeneratePythonCWrappers(generated_python_c_functions,
+                                           generated_python_c_registration)
 
-    print("Generated Python-C Codes: ", python_c_str)
+    logging.info(f"Generated Python-C Codes: \n{python_c_str}")
 
     output_path = args.output_path
     for path in [output_path]:
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 603f93d9ddc14..0e9dc19c2e310 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -19,6 +19,8 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/grad_tensor_holder.h"
 #include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
@@ -27,6 +29,325 @@
 
 namespace egr {
 
+/*
+* GeneralGrad is Helpper class to implement custom grad operation between
+* outputs and inputs.
+*
+* **/
+class GeneralGrad {
+ public:
+  static GeneralGrad& Instance() { return *general_grad_; }
+
+  // Get inputs's / no_grad_vars's GradNodes and InputMeta Info
+  void GetTargetNodesInfo(
+      const std::vector<paddle::experimental::Tensor>& inputs,
+      bool is_no_grad_vars) {
+    std::string msg = is_no_grad_vars ? "no_grad_vars" : "inputs";
+    VLOG(6) << "Running in GetTargetNodesInfo.";
+    if (!inputs.empty()) {
+      VLOG(6) << msg << " are not empty.";
+      size_t num_inputs = inputs.size();
+      for (size_t i = 0; i < num_inputs; i++) {
+        AutogradMeta* auto_grad_meta =
+            EagerUtils::unsafe_autograd_meta(inputs[i]);
+        auto target_node = auto_grad_meta->GetMutableGradNode().get();
+        PADDLE_ENFORCE_NOT_NULL(target_node,
+                                paddle::platform::errors::Fatal(
+                                    "There is no grad op for %s:[%d] or it's"
+                                    "stop_gradient=True.",
+                                    msg, i));
+        if (is_no_grad_vars) {
+          (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta;
+        } else {  // normal input
+          (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
+        }
+      }
+    }
+  }
+
+  // Purify potential_startup_nodes, remove nodes those are the same as
+  // input_target_nodes
+  void PurifyPotentialStartUpNodes() {
+    VLOG(6) << "Running in PurifyPotentialStartUpNodes";
+    if (input_target_nodes_inputmeta_map.empty()) return;
+    std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
+    for (auto startup_op : potential_startup_nodes) {
+      auto iter = input_target_nodes_inputmeta_map.find(startup_op);
+      if (iter != input_target_nodes_inputmeta_map.end()) {
+        potential_startup_nodes_to_be_erased.emplace(iter->first);
+      }
+    }
+    if (!potential_startup_nodes_to_be_erased.empty()) {
+      for (auto nodes : potential_startup_nodes_to_be_erased) {
+        potential_startup_nodes.erase(nodes);
+      }
+    }
+  }
+
+  // Remove some nodes those doesn't need to be
+  // stored in potential_stop_nodes、potential_startup_nodes
+  void UpdateGraphInfo() {
+    // Updated potential_sotp_nodes by depending_nodes,
+    // make sure the path from root to target_node is ok
+    std::unordered_set<GradNodeBase*> _startup_ops;
+    VLOG(6) << "Running in UpdateGraphInfo";
+    std::queue<GradNodeBase*> queue;
+    for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) {
+      queue.emplace(target_nodes_inputmeta_pair.first);
+    }
+
+    while (!queue.empty()) {
+      auto* target_node = queue.front();
+      queue.pop();
+      if (!(depending_nodes)[target_node].empty()) {
+        auto precedding_nodes = (depending_nodes)[target_node];
+        for (auto pre_nodes : precedding_nodes) {
+          queue.emplace(pre_nodes);
+          if (potential_stop_nodes.find(pre_nodes) !=
+              potential_stop_nodes.end()) {
+            potential_stop_nodes.erase(pre_nodes);
+          }
+        }
+      } else {  // startup_ops have no precedding nodes
+        VLOG(6) << "Emplace _startup_ops";
+        _startup_ops.emplace(target_node);
+      }
+    }
+    // Purify potential_startup_nodes again, remove some
+    // potential startup_nodes that unreach to input target nodes
+    if (!_startup_ops.empty()) {
+      std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
+      for (auto node : potential_startup_nodes) {
+        if (_startup_ops.count(node) == 0) {
+          VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
+          potential_startup_nodes_to_be_erased.emplace(node);
+        }
+      }
+      if (!potential_startup_nodes_to_be_erased.empty()) {
+        for (auto node : potential_startup_nodes_to_be_erased) {
+          VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
+          potential_startup_nodes.erase(node);
+        }
+      }
+    }
+  }
+
+  // Get Graph Info Betweent input target GradNode and outputs，
+  // record depending_nodes、potential_stop_nodes、potential_startup_nodes
+  void GetGraphInfoBetweenTargets(const std::queue<GradNodeBase*>& init_queue) {
+    VLOG(6) << "Runing In GetGraphInfoBetweenTargets";
+
+    // Calculate in_degree for each node
+    std::unordered_map<GradNodeBase*, int> node_in_degree_map;
+
+    // Copy nodes
+    std::queue<GradNodeBase*> queue = init_queue;
+    std::unordered_set<GradNodeBase*> visited;
+
+    // Visit each node exactly once in any order
+    while (!queue.empty()) {
+      GradNodeBase* node = queue.front();
+      queue.pop();
+
+      if (visited.count(node)) {
+        continue;
+      }
+      visited.insert(node);
+
+      // Check node is target_nodes or not, if node is not target_node,
+      // all the next_node will be marked in potential_stop_nodes
+      bool is_potential_stop_nodes =
+          input_target_nodes_inputmeta_map.count(node);
+
+      // Find and append next nodes
+      const std::vector<std::vector<Edge>>& edges = node->GetEdges();
+      for (const auto& edge_list : edges) {
+        for (const Edge& edge : edge_list) {
+          GradNodeBase* next_node = edge.GetMutableGradNode().get();
+
+          // Next node could be nullptr if it is leaf tensor with no
+          // AccumulationNode attached
+          // Or it could also originated from dispensable inputs
+          if (!next_node) continue;
+
+          // if node not in input_target_nodes,
+          // all the next_nodes of current node will be inserted to
+          // potential_stop_node
+          if (is_potential_stop_nodes) {
+            potential_stop_nodes.emplace(next_node);
+          }
+
+          // Update in_degree
+          if (!node_in_degree_map.count(next_node))
+            node_in_degree_map[next_node] = 0;
+          node_in_degree_map[next_node]++;
+
+          // Record depending relationship
+          (depending_nodes)[next_node].emplace(node);
+          queue.push(next_node);
+        }
+      }
+    }
+    // Update Graph Info, remove some nodes in
+    // potential_stop_nodes、potential_startup_nodes、
+    UpdateGraphInfo();
+  }
+
+  void ModifyReadyQueue(std::queue<GradNodeBase*>* queue) {
+    std::queue<GradNodeBase*> tmp_queue;
+    for (auto nodes : potential_startup_nodes) {
+      tmp_queue.emplace(nodes);
+    }
+    tmp_queue.swap(*queue);
+  }
+
+  // Set result for input target grad_var when potential_startup_nodes is empty
+  void SetResultForInputTargetVar(
+      const std::unordered_map<GradNodeBase*,
+                               std::unique_ptr<GradTensorHolder>>&
+          node_input_buffers_dict) {
+    if (potential_startup_nodes.size() == 0) {
+      for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) {
+        // out rank_info of forward op
+        auto rank_info = input_target_node.second->OutRankInfo();
+        auto iter = node_input_buffers_dict.find(input_target_node.first);
+        if (iter != node_input_buffers_dict.end()) {
+          auto& target_result =
+              (iter->second)->Buffers()[rank_info.first][rank_info.second];
+          // save the target result
+          results_map[input_target_node.first] = target_result;
+        }
+      }
+    }
+  }
+
+  // Set input target grad_var from node_input_buffer by inputmeta
+  void SetResultForInputTargetVar(GradTensorHolder input_buffers,
+                                  GradNodeBase* node) {
+    auto iter = GetInPutTargetNodesInputMetaMap()->find(node);
+    if (iter != GetInPutTargetNodesInputMetaMap()->end()) {
+      VLOG(6) << "Get target result by by inputmeta";
+      // out rank_info of forward op
+      auto rank_info = (iter->second)->OutRankInfo();
+      // rank_info is a pair, first means slot_id, second means rank.
+      auto& target_result =
+          input_buffers.Buffers()[rank_info.first][rank_info.second];
+      // save the target result
+      results_map[node] = target_result;
+    }
+  }
+
+  std::vector<paddle::experimental::Tensor> GetResults(
+      const std::vector<paddle::experimental::Tensor>& inputs,
+      bool allow_unused, bool create_graph) {
+    VLOG(6) << "Running in GetResults";
+    if (inputs.empty()) return {};
+
+    std::vector<paddle::experimental::Tensor> results;
+    results.reserve(inputs.size());
+
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      auto& input = inputs[i];
+      AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
+      auto target_node = auto_grad_meta->GetMutableGradNode().get();
+
+      auto iter = results_map.find(target_node);
+      if (iter != results_map.end()) {
+        // set StopGradient = !create_graph
+        AutogradMeta* tensor_auto_grad_meta =
+            EagerUtils::autograd_meta(&(iter->second));
+        tensor_auto_grad_meta->SetStopGradient(!create_graph);
+        results.emplace_back(iter->second);
+      } else {
+        PADDLE_ENFORCE_EQ(allow_unused, true,
+                          paddle::platform::errors::InvalidArgument(
+                              "The %d-th input does not appear in the backward "
+                              "graph. Please check the input tensor or set "
+                              "allow_unused=True to get None result.",
+                              i));
+        results.emplace_back();
+      }
+    }
+    Clear();
+    return results;
+  }
+
+  void PreparedForGeneralGrad(
+      const std::vector<paddle::experimental::Tensor>& inputs,
+      const std::vector<paddle::experimental::Tensor>& no_grad_vars,
+      std::queue<GradNodeBase*>* queue,
+      const std::unordered_map<GradNodeBase*,
+                               std::unique_ptr<GradTensorHolder>>&
+          node_input_buffers_dict) {
+    // Get no_grad_vars's GradNodes and InputMeta Info
+    GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */);
+    // Get inputs's GradNodes and InputMeta Info
+    GetTargetNodesInfo(inputs, false /* is_no_grad_vars */);
+    // Purify potential_startup_ops, remove those nodes that are the same as
+    // input_target_nodes
+    PurifyPotentialStartUpNodes();
+    // Get Graph Info Betweent input target gradnode and outputs
+    // Record the depending_nodes and
+    // potential_stop_nodes、potential_startup_nodes
+    GetGraphInfoBetweenTargets(*queue);
+    // Reset queue. Queue is empty only when
+    // 1.input equals to output. 2.input can not reach to output.
+    ModifyReadyQueue(queue);
+    // Set result for input target grad_var when queue is empty
+    if (queue->empty()) SetResultForInputTargetVar(node_input_buffers_dict);
+  }
+
+  bool IsPotentialStopNodes(GradNodeBase* node) {
+    return potential_stop_nodes.count(node);
+  }
+
+  std::unordered_map<GradNodeBase*, AutogradMeta*>*
+  GetNoGradVarNodesInputMetaMap() {
+    return &no_grad_var_nodes_inputmeta_map;
+  }
+
+  std::unordered_map<GradNodeBase*, AutogradMeta*>*
+  GetInPutTargetNodesInputMetaMap() {
+    return &input_target_nodes_inputmeta_map;
+  }
+
+  std::unordered_set<GradNodeBase*>* GetPotentialStopNodes() {
+    return &potential_stop_nodes;
+  }
+
+  std::unordered_set<GradNodeBase*>* GetPotentialStartupNodes() {
+    return &potential_startup_nodes;
+  }
+
+  void Clear() {
+    no_grad_var_nodes_inputmeta_map.clear();
+    input_target_nodes_inputmeta_map.clear();
+    potential_startup_nodes.clear();
+    potential_stop_nodes.clear();
+    depending_nodes.clear();
+    results_map.clear();
+  }
+
+ private:
+  GeneralGrad() = default;
+  static GeneralGrad* general_grad_;
+  // no_grad_vars's GradNode and GradNode's InputMeta.
+  std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
+      no_grad_var_nodes_inputmeta_map;
+  // inputs's GradNode and GradNode's InputMeta.
+  std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
+      input_target_nodes_inputmeta_map;
+  // Record all the potential startup_nodes, will be changed.
+  std::unordered_set<GradNodeBase*> potential_startup_nodes;
+  // Record all the potential stop nodes, will be changed.
+  std::unordered_set<GradNodeBase*> potential_stop_nodes;
+  std::unordered_map<GradNodeBase* /* next node */,
+                     std::unordered_set<GradNodeBase*> /* pre nodes */>
+      depending_nodes;
+  std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
+  DISABLE_COPY_AND_ASSIGN(GeneralGrad);
+};
+
 std::unordered_map<GradNodeBase*, int> getInDegreeMap(
     const std::queue<GradNodeBase*>& init_queue) {
   // Calculate in_degree for each node
@@ -74,14 +395,51 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
   return node_in_degree_map;
 }
 
-void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
-                 const std::vector<paddle::experimental::Tensor>& grad_tensors,
-                 bool retain_graph) {
+// Enforce GradNode has TensorWrappers as Input
+void EnforceGradNodeHasInput(GradNodeBase* node) {
+  VLOG(6) << "Running in EnforceGradNodeHasInput";
+  PADDLE_ENFORCE_NE(
+      node->IsTensorWrappersCleared(), true,
+      paddle::platform::errors::Fatal(
+          "The TensorWrappers of %s do not exist. This may be because:\n"
+          "You calculate backward twice for the same subgraph without "
+          "setting retain_graph=True. Please set retain_graph=True in the "
+          "first backward/grad call.\n",
+          node->name()));
+}
+
+void DuplicateCheck(const std::vector<paddle::experimental::Tensor>& inputs,
+                    bool is_input) {
+  std::unordered_set<AutogradMeta*> visisted_ins;
+  std::string msg = is_input ? "inputs" : "outputs";
+  for (auto in : inputs) {
+    AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
+    PADDLE_ENFORCE_EQ(
+        visisted_ins.count(auto_grad_meta), 0,
+        paddle::platform::errors::AlreadyExists(
+            "%s contain duplicate tensor %s, please check %s carefully.", msg,
+            in.name(), msg));
+    visisted_ins.insert(auto_grad_meta);
+  }
+}
+
+GeneralGrad* GeneralGrad::general_grad_ = new GeneralGrad();
+
+std::vector<paddle::experimental::Tensor> RunBackward(
+    const std::vector<paddle::experimental::Tensor>& tensors,  // output
+    const std::vector<paddle::experimental::Tensor>& grad_tensors,
+    bool retain_graph, bool create_graph = false,
+    const std::vector<paddle::experimental::Tensor>& inputs = {},
+    bool allow_unused = false,
+    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
   VLOG(6) << "Start Backward";
   // *Gradient Hook should happen at node-level
   // *Inplace version check should perform at node-level
   // *Cross-batch accumulation happens at forward pass
 
+  // GeneralGrad
+  bool is_general_grad = !inputs.empty();
+
   /* --- Initialization --- */
   // 1. Init queue with starting nodes
   // 2. Prepare initial input buffers
@@ -124,11 +482,20 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
           paddle::platform::errors::Fatal(
               "Detected size mismatch between tensors and grad_tensors"
               "grad_tensors should either have "
-              "size = 0 or same size as tensors"));
+              "size = 0 or same size as tensors."));
       // Feed given tensor if it's provided
       VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
-      node_input_buffers_dict[grad_node]->add(
-          input_info.first, input_info.second, grad_tensors[i]);
+
+      if (grad_tensors[i].is_initialized()) {
+        // Deep copy
+        paddle::experimental::Tensor tmp_tensor;
+        tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true);
+        node_input_buffers_dict[grad_node]->add(input_info.first,
+                                                input_info.second, tmp_tensor);
+      } else {
+        node_input_buffers_dict[grad_node]->add(
+            input_info.first, input_info.second, grad_tensors[i]);
+      }
 
     } else {
       VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
@@ -141,8 +508,11 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
           input_info.first, input_info.second, tensor, true /*fill_one=true*/);
     }
 
-    // Prepare queue
+    // Prepare queue, potential startup_nodes
     queue.push(grad_node);
+    if (is_general_grad) {
+      GeneralGrad::Instance().GetPotentialStartupNodes()->emplace(grad_node);
+    }
   }
 
   VLOG(6) << "Update In degree Map for backward";
@@ -150,41 +520,88 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
   std::unordered_map<GradNodeBase*, int> node_in_degree_map =
       getInDegreeMap(queue);
 
+  if (is_general_grad) {
+    // Prepare several vital preprocess for GeneralGrad
+    GeneralGrad::Instance().PreparedForGeneralGrad(inputs, no_grad_vars, &queue,
+                                                   node_input_buffers_dict);
+  }
+
+  VLOG(6) << " startup_ops' size is :" << queue.size();
+
   /* --- Topological Visit --- */
   // 1. Pop queue
   // 2. Run node
+  //    |- Check and capture target result
   //    |- node(grads)
   //    |- Prepare for next node
   // 3. Update queue
   VLOG(6) << "Run Backward";
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
+    VLOG(6) << "Running GradNode:" << node->name();
+
+    paddle::platform::RecordEvent node_record_event(
+        std::string(typeid(*node).name()) + " grad_node",
+        paddle::platform::TracerEventType::Operator, 1);
 
     if (queue.size() > 1 && node_in_degree_map[node] != 0) {
       queue.pop();
       continue;
     }
     queue.pop();
+
     // Run node: This is where Hook happens
     PADDLE_ENFORCE(
         node_input_buffers_dict.count(node),
         paddle::platform::errors::Fatal(
             "Unable to find next node in the GradTensorHolder \n"
-            "Trying to run Node without configuring its GradTensorHolder"));
+            "Trying to run Node without configuring its GradTensorHolder."));
 
     std::unique_ptr<GradTensorHolder> node_input_buffer =
         std::move(node_input_buffers_dict[node]);
 
-    VLOG(6) << "Run Backward Kernel with GradTensorHolder";
+    // Set input target grad_var from node_input_buffer by inputmeta
+    if (!inputs.empty() && is_general_grad) {
+      GeneralGrad::Instance().SetResultForInputTargetVar(*node_input_buffer,
+                                                         node);
+    }
+
+    // no_grad_vars
+    if (!no_grad_vars.empty() && is_general_grad) {
+      auto iter =
+          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node);
+      if (iter !=
+          GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) {
+        VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
+        auto rank_info = (iter->second)->OutRankInfo();
+        node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
+                                                  rank_info.second);
+      }
+    }
+
+    VLOG(6) << "Running GradNode:" << node->name();
+
+    // Check input
+    EnforceGradNodeHasInput(node);
+
+    VLOG(6) << "Run Backward Kernel with GradTensorHolder.";
     // Run Pre Backward Node and get outputs
     std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
-        (*node)(node_input_buffer->Buffers());
+        (*node)(node_input_buffer->Buffers(), create_graph);
+
+    // retain_grad or not
+    if (!retain_graph) {
+      VLOG(6)
+          << "retain_graph is false, need to clear the TensorWrapper of nodes.";
+      node->ClearTensorWrappers();
+    }
+
     // TODO(jiabin): Should we erase it or find a more efficient way.
+
     node_input_buffers_dict.erase(node);
 
     // Prepare GradTensorHolder for next node
     const std::vector<std::vector<Edge>>& edges = node->GetEdges();
-
     PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(),
                    paddle::platform::errors::Fatal(
                        "Number of edges should be either empty ( for leaf node "
@@ -195,6 +612,9 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
     for (size_t i = 0; i < edges.size(); i++) {
       for (size_t j = 0; j < edges[i].size(); j++) {
         const Edge& edge = edges[i][j];
+        if (!edge.IsInitialized()) {
+          continue;
+        }
         auto edge_rank = edge.GetEdgeRankInfo();
         // Since we make edge has as same rank as bwd outputs, we indexing them
         // with
@@ -208,6 +628,7 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
             grad_output_tensors[i].empty()) {
           continue;
         }
+
         PADDLE_ENFORCE_LT(
             j, grad_output_tensors[i].size(),
             paddle::platform::errors::Fatal(
@@ -243,18 +664,54 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
 
         // Update queue
         node_in_degree_map[next_node]--;
+
         PADDLE_ENFORCE(
             node_in_degree_map[next_node] >= 0,
             paddle::platform::errors::Fatal(
                 "Detected in-degree value smaller than zero. For Node: %s"
-                "Node's in-degree cannot be negative",
+                "Node's in-degree cannot be negative.",
                 next_node->name()));
-        if (node_in_degree_map[next_node] == 0) {
-          queue.emplace(std::move(next_node));
+
+        if (is_general_grad) {
+          bool is_potential_stop_node =
+              GeneralGrad::Instance().GetPotentialStopNodes()->count(next_node);
+          if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
+            queue.emplace(std::move(next_node));
+          }
+        } else {
+          if (node_in_degree_map[next_node] == 0) {
+            queue.emplace(std::move(next_node));
+          }
         }
       }
     }
   }
+  if (!is_general_grad) return {};
+  return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
+}
+
+void Backward(
+    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
+    const std::vector<paddle::experimental::Tensor>& grad_tensors,
+    bool retain_graph) {
+  VLOG(6) << "Run in Backward";
+  paddle::platform::RecordEvent backward_record_event(
+      "backward", paddle::platform::TracerEventType::Operator, 1);
+  RunBackward(tensors, grad_tensors, retain_graph);
 }
 
+std::vector<paddle::experimental::Tensor> Grad(
+    const std::vector<paddle::experimental::Tensor>& tensors,  // outputs
+    const std::vector<paddle::experimental::Tensor>& inputs,
+    const std::vector<paddle::experimental::Tensor>& grad_tensors,
+    bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
+    const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
+  VLOG(6) << "Run in Grad";
+
+  DuplicateCheck(inputs, true /* is_input */);
+  DuplicateCheck(tensors, false /* is_input */);
+
+  return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
+                     allow_unused, no_grad_vars);
+}
 }  // namespace egr
diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h
index 2856d9fb87f34..bebe664838e6c 100644
--- a/paddle/fluid/eager/backward.h
+++ b/paddle/fluid/eager/backward.h
@@ -19,12 +19,20 @@
 
 namespace egr {
 
-// run_backward():
+// Backward():
 // tensors corresponds to those lived in the backward graph
 // each grad_tensors[i] keeps the value for its corresponding tensors[i]
-void RunBackward(const std::vector<paddle::experimental::Tensor> &tensors,
-                 const std::vector<paddle::experimental::Tensor> &grad_tensors,
-                 bool retain_graph = false);
+void Backward(const std::vector<paddle::experimental::Tensor>& tensors,
+              const std::vector<paddle::experimental::Tensor>& grad_tensors,
+              bool retain_graph = false);
+
+std::vector<paddle::experimental::Tensor> Grad(
+    const std::vector<paddle::experimental::Tensor>& tensors,
+    const std::vector<paddle::experimental::Tensor>& inputs,
+    const std::vector<paddle::experimental::Tensor>& grad_tensors = {},
+    bool retain_graph = false, bool create_graph = false,
+    bool only_inputs = false, bool allow_unused = false,
+    const std::vector<paddle::experimental::Tensor>& no_grad_vars = {});
 
 // Reserved for gradient()
 
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
index 48ac8c8358afd..08ca3bed5a653 100644
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -20,8 +20,8 @@
 
 namespace egr {
 std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
-operator()(
-    const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
+operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+           bool create_graph) {  // NOLINT
   paddle::CustomOpKernelContext ctx;
   auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
       egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h
index e5ddef9c06214..33b56fc8c863a 100644
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.h
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h
@@ -37,7 +37,8 @@ class RunCustomOpNode : public GradNodeBase {
 
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      bool create_graph = false)  // NOLINT
       override;
 
   std::string name() {
@@ -62,6 +63,12 @@ class RunCustomOpNode : public GradNodeBase {
     return res;
   }
 
+  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+  bool IsTensorWrappersCleared() override {
+    VLOG(6) << "Do nothing here now";
+    return false;
+  }
+
   void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
 
  public:
diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc
index 7eb2902d935c4..25610a3f95fe5 100644
--- a/paddle/fluid/eager/grad_node_info.cc
+++ b/paddle/fluid/eager/grad_node_info.cc
@@ -15,10 +15,16 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/accumulation/accumulation_node.h"
 #include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/utils.h"
+
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/var_type.h"
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 
@@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
   VLOG(6) << "Construct GradNodeBase";
   bwd_in_meta_.resize(bwd_in_slot_num);
   bwd_out_meta_.resize(bwd_out_slot_num);
-  // adj_edges has the same num as backward outputs
   adj_edges_.resize(bwd_out_slot_num);
 }
 
@@ -44,24 +49,22 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
           "Given slot id is out of range of adj_edges outter size, "
           "adj_edges is designed to has the same size of grad "
           "inputs's slot num."));
-  for (const auto& meta : *metas) {
+
+  for (size_t i = 0; i < metas->size(); i++) {
+    const auto& meta = (*metas)[i];
     // adj_edges has as same rank as fwd inputs, and record it's output rank
     // from
     // its pre-ops
     if (meta && !meta->StopGradient()) {
       auto node = meta->GetMutableGradNode();
-      if (node && node.get()) {
-        VLOG(6) << "Add Edges for slot: " << slot_id
-                << " which is: " << meta->GetMutableGradNode()->name();
-        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
-                                         meta->OutRankInfo());
-      } else {
+      if (!node || !node.get()) {
         meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
-        VLOG(6) << "Add Edges for slot: " << slot_id
-                << " which is: " << meta->GetMutableGradNode()->name();
-        adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
-                                         meta->OutRankInfo());
       }
+
+      adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
+                                       meta->OutRankInfo());
+    } else {
+      adj_edges_[slot_id].emplace_back();
     }
   }
 }
@@ -73,130 +76,227 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
           "Given slot id is out of range of adj_edges outter size, "
           "adj_edges is designed to has the same size of grad "
           "inputs's slot num."));
+
   if (meta && !meta->StopGradient()) {
     auto node = meta->GetMutableGradNode();
-    if (node && node.get()) {
-      VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
-              << this->name() << " to " << meta->GetMutableGradNode()->name();
-      adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
-                                       meta->OutRankInfo());
-    } else {
+    if (!node || !node.get()) {
       meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
-      VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
-              << this->name() << " to " << meta->GetMutableGradNode()->name();
-      adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
-                                       meta->OutRankInfo());
     }
+    VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
+            << this->name() << " to " << meta->GetMutableGradNode()->name();
+
+    adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
+                                     meta->OutRankInfo());
+  } else {
+    adj_edges_[slot_id].emplace_back();
   }
 }
 
-const std::vector<GradSlotMeta>& GradNodeBase::InputMeta() const {
+const std::vector<std::vector<GradSlotMeta>>& GradNodeBase::InputMeta() const {
   return bwd_in_meta_;
 }
 
-const std::vector<GradSlotMeta>& GradNodeBase::OutputMeta() const {
+const std::vector<std::vector<GradSlotMeta>>& GradNodeBase::OutputMeta() const {
   return bwd_out_meta_;
 }
 
-void GradNodeBase::SetGradInMeta(std::vector<AutogradMeta*>* fwd_out,
+void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
                                  size_t slot_rank) {
-  size_t slot_size = fwd_out->size();
+  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
+  auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out);
   PADDLE_ENFORCE_LE(
       slot_rank, (bwd_in_meta_.size() - 1),
       paddle::platform::errors::InvalidArgument(
           "Slot Rank should less equal than bwd_in_meta_ size, since "
           "bwd_in_meta_ is designed to hold as same num as backward "
           "inputs."));
-  auto& meta = bwd_in_meta_.at(slot_rank);
-  PADDLE_ENFORCE_EQ(meta.IsInitialized(), false,
-                    paddle::platform::errors::PreconditionNotMet(
-                        "Bwd_in_meta should only be init once, addition "
-                        "initialization for it is forbidden. If you got this "
-                        "error, it indicates bugs in framework."));
-  // Init stop gradient vector before use to avoid push back
-  meta.Init(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i],
-                            paddle::platform::errors::PreconditionNotMet(
-                                "Bwd_in_meta should only be called while "
-                                "autograd_meta is not null. If you got this "
-                                "error, it indicates bugs in framework."));
-    if ((*fwd_out)[i]->StopGradient()) {
-      // Set Stop Gradient only when its true or non-initialized autograd_meta,
-      // since all default value is false.
-      meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient());
+  auto& metas = bwd_in_meta_.at(slot_rank);
+  if (metas.size() == 0) {
+    metas.resize(1);
+  }
+
+  auto& meta = metas[0];
+  meta.SetStopGradient(fwd_out_meta->StopGradient());
+
+  if (!fwd_out.is_initialized()) {
+    VLOG(6)
+        << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
+    return;
+  }
+
+  // Record TensorMeta
+  if (phi::DenseTensor::classof(fwd_out.impl().get())) {
+    // Only Copy Meta
+    phi::DenseTensor* dense_tensor =
+        static_cast<phi::DenseTensor*>(fwd_out.impl().get());
+
+    PADDLE_ENFORCE_NE(
+        dense_tensor->meta().dtype, phi::DataType::UNDEFINED,
+        paddle::platform::errors::Fatal(
+            "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED,"
+            "which is illegal."));
+
+    meta.SetTensorMeta(dense_tensor->meta());
+    meta.SetPlace(fwd_out.inner_place());
+
+    if (paddle::framework::IsComplexType(
+            paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
+      need_complex_to_real_ = true;
     }
+  } else {
+    VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
+               "non-DenseTensor argument.";
   }
 }
 
-void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) {
+void GradNodeBase::SetGradInMeta(
+    const std::vector<paddle::experimental::Tensor>& fwd_out,
+    size_t slot_rank) {
+  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
+  size_t slot_size = fwd_out.size();
   PADDLE_ENFORCE_LE(
       slot_rank, (bwd_in_meta_.size() - 1),
       paddle::platform::errors::InvalidArgument(
           "Slot Rank should less equal than bwd_in_meta_ size, since "
           "bwd_in_meta_ is designed to hold as same num as backward "
           "inputs."));
-  auto& meta = bwd_in_meta_.at(slot_rank);
-  PADDLE_ENFORCE_EQ(meta.IsInitialized(), false,
-                    paddle::platform::errors::PreconditionNotMet(
-                        "Bwd_in_meta should only be init once, Additional "
-                        "initialization for it is forbidden. If you got this "
-                        "error, it indicates bugs in framework."));
+  auto& metas = bwd_in_meta_.at(slot_rank);
   // Init stop gradient vector before use to avoid push back
-  VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank;
-  meta.Init(1);
-  meta.SetStopGradient(0, fwd_out->StopGradient());
+  if (metas.size() < slot_size) {
+    VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank;
+    metas.resize(slot_size);
+  }
+  for (size_t i = 0; i < slot_size; i++) {
+    auto& meta = metas[i];
+    const auto& fwd_out_tensor = fwd_out[i];
+    auto* fwd_out_meta =
+        egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor);
+    PADDLE_ENFORCE_NOT_NULL(fwd_out_meta,
+                            paddle::platform::errors::PreconditionNotMet(
+                                "Bwd_in_meta should only be called while "
+                                "autograd_meta is not null. If you got this "
+                                "error, it indicates bugs in framework."));
+    if (fwd_out_meta->StopGradient()) {
+      // Set Stop Gradient only when its true or non-initialized autograd_meta,
+      // since all default value is false.
+      meta.SetStopGradient(fwd_out_meta->StopGradient());
+    }
+
+    if (!fwd_out_tensor.is_initialized()) {
+      VLOG(6)
+          << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
+      return;
+    }
+
+    // Record TensorMeta
+    if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) {
+      // Only Copy Meta
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(fwd_out_tensor.impl().get());
+
+      PADDLE_ENFORCE_NE(
+          dense_tensor->meta().dtype, phi::DataType::UNDEFINED,
+          paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta "
+                                          "with phi::DataType::UNDEFINED,"
+                                          "which is illegal."));
+      meta.SetTensorMeta(dense_tensor->meta());
+      meta.SetPlace(fwd_out_tensor.inner_place());
+
+      if (paddle::framework::IsComplexType(
+              paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
+        need_complex_to_real_ = true;
+      }
+    } else {
+      VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
+                 "with non-DenseTensor argument.";
+    }
+  }
 }
 
-void GradNodeBase::SetGradOutMeta(std::vector<AutogradMeta*>* fwd_in,
+void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in,
                                   size_t slot_rank) {
-  size_t slot_size = fwd_in->size();
+  auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in);
   PADDLE_ENFORCE_LE(
-      slot_rank, (bwd_out_meta_.size() - 1),
+      (slot_rank + 1), bwd_out_meta_.size(),
       paddle::platform::errors::InvalidArgument(
           "Slot Rank should less equal than bwd_out_meta_ size, "
           "since bwd_out_meta_ is designed to hold as same num as "
           "backward outputs."));
-  auto& meta = bwd_out_meta_.at(slot_rank);
-  PADDLE_ENFORCE_EQ(meta.IsInitialized(), false,
-                    paddle::platform::errors::PreconditionNotMet(
-                        "Bwd_out_meta should only be init once. Additional "
-                        "initialization for it is forbidden. If you got this "
-                        "error, it indicates bugs in framework."));
+  auto& metas = bwd_out_meta_.at(slot_rank);
   // Init stop gradient vector before use to avoid push back
-  meta.Init(slot_size);
-  for (size_t i = 0; i < slot_size; i++) {
-    if (!(*fwd_in)[i]) {
-      meta.SetStopGradient(i, true);
-      continue;
-    }
-    if ((*fwd_in)[i]->StopGradient()) {
-      // Set Stop Gradient only when its true or non-initialized autograd_meta,
-      // since all default value is false.
-      meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient());
+  if (metas.size() == 0) {
+    metas.resize(1);
+  }
+  auto& meta = metas[0];
+  if (fwd_in_meta) {
+    meta.SetStopGradient(fwd_in_meta->StopGradient());
+  } else {
+    meta.SetStopGradient(true);
+  }
+
+  // Record TensorMeta
+  if (fwd_in.impl() && fwd_in.impl().get()) {
+    if (phi::DenseTensor::classof(fwd_in.impl().get())) {
+      // Only Copy Meta
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(fwd_in.impl().get());
+      PADDLE_ENFORCE_NE(
+          dense_tensor->meta().dtype, phi::DataType::UNDEFINED,
+          paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta "
+                                          "with phi::DataType::UNDEFINED,"
+                                          "which is illegal."));
+      meta.SetTensorMeta(dense_tensor->meta());
+      meta.SetPlace(fwd_in.inner_place());
     }
+  } else {
+    VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
+               "non-DenseTensor argument.";
   }
 }
 
-void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) {
+void GradNodeBase::SetGradOutMeta(
+    const std::vector<paddle::experimental::Tensor>& fwd_in, size_t slot_rank) {
+  size_t slot_size = fwd_in.size();
   PADDLE_ENFORCE_LE(
-      (slot_rank + 1), bwd_out_meta_.size(),
+      slot_rank, (bwd_out_meta_.size() - 1),
       paddle::platform::errors::InvalidArgument(
           "Slot Rank should less equal than bwd_out_meta_ size, "
           "since bwd_out_meta_ is designed to hold as same num as "
           "backward outputs."));
-  auto& meta = bwd_out_meta_.at(slot_rank);
-  PADDLE_ENFORCE_EQ(meta.IsInitialized(), false,
-                    paddle::platform::errors::PreconditionNotMet(
-                        "Bwd_out_meta should only be init once. Additional "
-                        "initialization for it is forbidden. If you got this "
-                        "error, it indicates bugs in framework."));
+  auto& metas = bwd_out_meta_.at(slot_rank);
   // Init stop gradient vector before use to avoid push back
-  meta.Init(1);
-  if (fwd_in) {
-    meta.SetStopGradient(0, fwd_in->StopGradient());
-  } else {
-    meta.SetStopGradient(0, true);
+  if (metas.size() < slot_size) {
+    metas.resize(slot_size);
+  }
+  for (size_t i = 0; i < slot_size; i++) {
+    const auto& fwd_in_tensor = fwd_in[i];
+    auto& meta = metas[i];
+    auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor);
+    if (fwd_in_meta) {
+      // Set Stop Gradient only when its true or non-initialized autograd_meta,
+      // since all default value is false.
+      meta.SetStopGradient(fwd_in_meta->StopGradient());
+    }
+
+    // Record TensorMeta
+    if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) {
+      if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) {
+        // Only Copy Meta
+        phi::DenseTensor* dense_tensor =
+            static_cast<phi::DenseTensor*>(fwd_in_tensor.impl().get());
+
+        PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED,
+                          paddle::platform::errors::Fatal(
+                              "Attempting to copy DenseTensorMeta with "
+                              "phi::DataType::UNDEFINED,"
+                              "which is illegal."));
+        meta.SetTensorMeta(dense_tensor->meta());
+        meta.SetPlace(fwd_in_tensor.inner_place());
+      }
+    } else {
+      VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
+                 "with non-DenseTensor argument.";
+    }
   }
 }
 
@@ -207,12 +307,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() {
                      "meta setter, other size of inputs and outputs should "
                      "create with Setter and Getters"));
   // Default stop_gradient is false and slot id is 0, slot size is 1;
-  bwd_out_meta_[0].Init(1);
-  bwd_in_meta_[0].Init(1);
-}
-
-const std::vector<std::vector<Edge>>& GradNodeBase::GetEdges() const {
-  return adj_edges_;
+  bwd_out_meta_[0].resize(1);
+  bwd_in_meta_[0].resize(1);
 }
 
 int64_t GradNodeBase::RegisterGradientHook(
@@ -222,6 +318,10 @@ int64_t GradNodeBase::RegisterGradientHook(
   return next_hook_id_++;
 }
 
+const std::vector<std::vector<Edge>>& GradNodeBase::GetEdges() const {
+  return adj_edges_;
+}
+
 std::vector<std::vector<paddle::experimental::Tensor>>
 GradNodeBase::ApplyGradientHooks(
     const std::vector<std::vector<paddle::experimental::Tensor>>& tensors) {
@@ -270,4 +370,45 @@ GradNodeBase::ApplyGradientHooks(
   return outs;
 }
 
+void GradNodeBase::HandleComplexGradToRealGrad(
+    std::vector<std::vector<paddle::experimental::Tensor>>* out_grads) {
+  for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) {
+    const std::vector<paddle::experimental::Tensor>& slot_out_grads =
+        (*out_grads)[slot_id];
+    for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) {
+      const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id];
+
+      PADDLE_ENFORCE(
+          slot_meta.HasTensorMeta() > 0,
+          paddle::platform::errors::Fatal(
+              "We require TensorMeta in GradInputMeta() to obtain forward data "
+              "types."
+              "However, no TensorMeta is detected in bwd_out_meta_."));
+
+      auto fwd_data_type = paddle::framework::TransToProtoVarType(
+          slot_meta.GetTensorMeta().dtype);
+      const paddle::experimental::Tensor& grad = slot_out_grads[rank_id];
+
+      if (paddle::framework::IsComplexType(fwd_data_type)) continue;
+
+      // Only Handle Complex To Real for DenseTensor for now
+      if (phi::DenseTensor::classof(grad.impl().get())) {
+        phi::DenseTensor* grad_dense_tensor =
+            static_cast<phi::DenseTensor*>(grad.impl().get());
+
+        auto curr_data_type =
+            paddle::framework::TransToProtoVarType(grad_dense_tensor->type());
+        if (!paddle::framework::IsComplexType(curr_data_type)) continue;
+
+        // Convert Complex GradOut to Real
+        auto out = std::make_shared<phi::DenseTensor>();
+        paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type,
+                                              *grad_dense_tensor, out.get());
+
+        (*out_grads)[slot_id][rank_id].set_impl(out);
+      }
+    }
+  }
+}
+
 }  // namespace egr
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index 16513f05e0777..4dec1c1f9f4e5 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -57,21 +57,32 @@ class AutogradMeta;
 class GradSlotMeta {
  public:
   GradSlotMeta() = default;
-  void Init(size_t size) {
-    size_ = static_cast<int>(size);
-    stop_gradient_.resize(size, false);
+  bool IsStopGradient() const { return stop_gradient_; }
+  void SetStopGradient(bool stop_gradient = true) {
+    stop_gradient_ = stop_gradient;
   }
 
-  bool IsInitialized() const { return size_ != -1; }
-  bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; }
-  int Size() const { return size_; }
-  void SetStopGradient(size_t rank, bool stop_gradient = true) {
-    stop_gradient_.at(rank) = stop_gradient;
+  void SetTensorMeta(const phi::DenseTensorMeta& meta) {
+    meta_ = std::make_shared<phi::DenseTensorMeta>(meta);
+  }
+  bool HasTensorMeta() const { return meta_ && meta_.get(); }
+  const phi::DenseTensorMeta& GetTensorMeta() const {
+    if (!HasTensorMeta()) {
+      PADDLE_THROW(paddle::platform::errors::Fatal(
+          "meta_ of GradSlotMeta has not been initialized yet."
+          "You're expected to check Edge availability with HasTensorMeta()"
+          "before calling GetTensorMeta() interface."));
+    }
+    return *meta_.get();
   }
 
+  void SetPlace(const phi::Place& place) { place_ = place; }
+  const phi::Place& GetPlace() const { return place_; }
+
  private:
-  int size_{-1};
-  std::vector<bool> stop_gradient_{false};
+  bool stop_gradient_{false};
+  phi::Place place_;
+  std::shared_ptr<phi::DenseTensorMeta> meta_ = nullptr;
 };
 
 class GradNodeBase {
@@ -95,8 +106,12 @@ class GradNodeBase {
    * is better choice to fit this format.
    * **/
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads) = 0;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+      bool create_graph = false) = 0;
+
+  virtual void ClearTensorWrappers() = 0;
 
+  virtual bool IsTensorWrappersCleared() = 0;
   /**
    * AddEdges is designed to set input tensors' backward Node as current
    * node's Edges.
@@ -108,25 +123,30 @@ class GradNodeBase {
   void AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id);
   void AddEdges(AutogradMeta* meta, size_t slot_id);
 
-  /**
-   * GetEdges is designed to get all edges of current node**/
-  const std::vector<std::vector<Edge>>& GetEdges() const;
+  // adj_edges were moved inside OutputMeta(), so no available direct access
+  // from GradNodeBase.
+  // To access Edges, get GradSlotMeta by calling OutputMeta(), then use
+  // slot_meta.GetEdge()
 
   /**
    * Get Input Meta of current Grad node**/
-  const std::vector<GradSlotMeta>& InputMeta() const;
+  const std::vector<std::vector<GradSlotMeta>>& InputMeta() const;
   /**
    * Get Output Meta of current Grad node**/
-  const std::vector<GradSlotMeta>& OutputMeta() const;
+  const std::vector<std::vector<GradSlotMeta>>& OutputMeta() const;
   /**
    * Set bwd ins and outs info with forward vars
    * **/
 
-  void SetGradInMeta(std::vector<AutogradMeta*>* fwd_out, size_t slot_rank);
-  void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank);
+  void SetGradInMeta(const std::vector<paddle::experimental::Tensor>& fwd_out,
+                     size_t slot_rank);
+  void SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
+                     size_t slot_rank);
 
-  void SetGradOutMeta(std::vector<AutogradMeta*>* fwd_in, size_t slot_rank);
-  void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank);
+  void SetGradOutMeta(const std::vector<paddle::experimental::Tensor>& fwd_in,
+                      size_t slot_rank);
+  void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in,
+                      size_t slot_rank);
 
   /**
    * Default setters for Grad in/out meta this should be used for same special
@@ -158,11 +178,21 @@ class GradNodeBase {
   std::vector<std::vector<paddle::experimental::Tensor>> ApplyGradientHooks(
       const std::vector<std::vector<paddle::experimental::Tensor>>& tensors);
 
+  /**
+    * Handle Complex - Real Type Promotion
+    * **/
+  void HandleComplexGradToRealGrad(
+      std::vector<std::vector<paddle::experimental::Tensor>>* out_grads);
+  bool NeedComplexToRealConversion() { return need_complex_to_real_; }
+
   virtual std::string name() { return "GradNodeBase"; }
 
- private:
-  // TODO(jiabin): Use SmallVector instead after merge PR from develop
+  /**
+       * GetEdges is designed to get all edges of current node**/
+  const std::vector<std::vector<Edge>>& GetEdges() const;
 
+ private:
+  // TODO(zhanlve): Merge adj_edges_ into GradOutMeta
   // Edges recorded the backward related node info, which indicate all edges
   // linked
   // by this Grad Node.
@@ -170,10 +200,10 @@ class GradNodeBase {
   std::vector<std::vector<Edge>> adj_edges_;
 
   // bwd_out_meta_ is used to record Grad output info for backward
-  std::vector<GradSlotMeta> bwd_out_meta_;
+  std::vector<std::vector<GradSlotMeta>> bwd_out_meta_;
 
   // bwd_in_meta_ used to record Grad input info for backward
-  std::vector<GradSlotMeta> bwd_in_meta_;
+  std::vector<std::vector<GradSlotMeta>> bwd_in_meta_;
   // Gradient Hooks
   // Customer may register a list of hooks which will be called in order during
   // backward
@@ -184,6 +214,8 @@ class GradNodeBase {
                         /* hook */ std::shared_ptr<TensorHook>>>
       gradient_hooks_;
 
+  // We handle complex to real conversion only if any complex GradIn is involved
+  bool need_complex_to_real_ = false;
   int64_t next_hook_id_{0};
 };
 
@@ -229,12 +261,22 @@ class Edge {
   }
 
   // Currently we use grad_node_ to identify if a edge is initialized.
-  bool IsInitialized() const { return grad_node_.get(); }
+  bool IsInitialized() const {
+    if (!grad_node_) {
+      return false;
+    } else {
+      if (!(grad_node_.get())) {
+        return false;
+      } else {
+        return true;
+      }
+    }
+  }
 
  private:
   size_t in_slot_id_;
   size_t in_rank_;
-  std::shared_ptr<GradNodeBase> grad_node_;
+  std::shared_ptr<GradNodeBase> grad_node_{nullptr};
 };
 
 }  // namespace egr
diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index 69fc7df2f1420..038ad09aa4d8b 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -21,6 +21,11 @@
 
 namespace egr {
 
+void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
+  buffer_[slot_id][rank] =
+      paddle::experimental::zeros_like(buffer_[slot_id][rank]);
+}
+
 void GradTensorHolder::add(size_t slot_id, size_t rank,
                            const paddle::experimental::Tensor& t,
                            bool fill_one) {
@@ -88,7 +93,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
     // Create new tensor->impl and fill it with 1.0
     if (t.defined()) {
       // Fill 1.0
-      buffer_[slot_id][rank] = paddle::experimental::ones_like(t);
+      buffer_[slot_id][rank] = paddle::experimental::ones_like(t, t.dtype());
     }
   }
 }
diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h
index d66a81fe82859..db03789ea7632 100644
--- a/paddle/fluid/eager/grad_tensor_holder.h
+++ b/paddle/fluid/eager/grad_tensor_holder.h
@@ -26,12 +26,13 @@ namespace egr {
  * GradTensorHolder should have as same format as forward output **/
 class GradTensorHolder {
  public:
-  explicit GradTensorHolder(const std::vector<GradSlotMeta>& meta) {
-    VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size();
-    buffer_.resize(meta.size());
+  explicit GradTensorHolder(
+      const std::vector<std::vector<GradSlotMeta>>& metas) {
+    VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size();
+    buffer_.resize(metas.size());
     for (size_t i = 0; i < buffer_.size(); i++) {
-      VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size();
-      buffer_[i].resize(meta[i].Size());
+      VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size();
+      buffer_[i].resize(metas[i].size());
     }
   }
 
@@ -52,10 +53,12 @@ class GradTensorHolder {
     return buffer_[pos];
   }
 
-  const std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
+  std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
     return buffer_;
   }
 
+  void SetBufferSlotRankZeros(size_t slot_id, size_t rank);
+
  private:
   std::vector<std::vector<paddle::experimental::Tensor>> buffer_;
 };
diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h
index 31aaa93c41643..8da27f3bb8a13 100644
--- a/paddle/fluid/eager/tensor_wrapper.h
+++ b/paddle/fluid/eager/tensor_wrapper.h
@@ -36,6 +36,15 @@ class TensorWrapper {
   explicit TensorWrapper(const paddle::experimental::Tensor& tensor,
                          bool full_reserved = false,
                          bool no_need_buffer = false) {
+    // set inplace_version_snapshot_ according to tensor's current inplace
+    // version.
+    if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) {
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(tensor.impl().get());
+      auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
+      inplace_version_snapshot_ = inplace_version_counter.CurrentVersion();
+    }
+
     /**
      * Normally, we should fully reserved all non-output or non-leaf fwd tensor
      * here. And for fwd output tensor, we should not reserve its autogradmeta,
@@ -49,6 +58,7 @@ class TensorWrapper {
     }
 
     // shallow copy tensor_impl here
+    no_need_buffer_ = no_need_buffer;
     if (no_need_buffer) {
       if (phi::DenseTensor::classof(tensor.impl().get())) {
         // Only Copy Meta
@@ -86,6 +96,7 @@ class TensorWrapper {
 
     // if it's full_reserved just return the full copy of tensor
     if (full_reserved_) {
+      check_inplace_version();
       return intermidiate_tensor_;
     } else {
       std::shared_ptr<GradNodeBase> new_grad_node = grad_node;
@@ -94,13 +105,52 @@ class TensorWrapper {
       intermidiate_tensor_.set_autograd_meta(
           std::static_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
               p_ab_autograd_meta));
+      check_inplace_version();
       return intermidiate_tensor_;
     }
   }
 
+  void check_inplace_version() {
+    if (no_need_buffer_) {
+      VLOG(6) << "There's no need to check inplace_version because "
+                 "no_need_buffer_ is true.";
+      return;
+    }
+    if (intermidiate_tensor_.impl() &&
+        phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) {
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(intermidiate_tensor_.impl().get());
+      auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
+
+      uint32_t current_inplace_version =
+          inplace_version_counter.CurrentVersion();
+      PADDLE_ENFORCE_EQ(
+          current_inplace_version, inplace_version_snapshot_,
+          paddle::platform::errors::PermissionDenied(
+              "Tensor '%s' used in gradient computation has been "
+              "modified by an inplace operation. "
+              "Its version is %d but the expected version is %d. "
+              "Please fix your code to void calling an inplace operator "
+              "after using the Tensor which will used in gradient "
+              "computation.",
+              intermidiate_tensor_.name(), current_inplace_version,
+              inplace_version_snapshot_));
+      VLOG(6) << " The inplace_version_snapshot_ of Tensor '"
+              << intermidiate_tensor_.name() << "' is [ "
+              << inplace_version_snapshot_ << " ]";
+      VLOG(6) << " The current_inplace_version of Tensor '"
+              << intermidiate_tensor_.name() << "' is [ "
+              << current_inplace_version << " ]";
+    }
+  }
+
+  void clear() { intermidiate_tensor_.reset(); }
+
  private:
   bool full_reserved_ = false;
+  bool no_need_buffer_ = false;
   std::pair<size_t, size_t> out_rank_info_;
   paddle::experimental::Tensor intermidiate_tensor_;
+  uint32_t inplace_version_snapshot_ = 0;
 };
 }  // namespace egr
diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
index 28682ab0fe094..6c6c7fd25e5e5 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc
@@ -80,13 +80,15 @@ TEST(AccumulationNode, Tensor) {
   grad_meta->SetStopGradient(false);
 
   // operator()
-  paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0];
+  std::vector<std::vector<paddle::experimental::Tensor>> et0_vec = {{et0}};
+  paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0];
   auto* ret_et0_ptr =
       std::dynamic_pointer_cast<phi::DenseTensor>(ret_et0.impl())
           ->data<paddle::platform::float16>();
   CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f));
 
-  paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0];
+  std::vector<std::vector<paddle::experimental::Tensor>> et1_vec = {{et1}};
+  paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0];
 
   auto* ret_et1_ptr =
       std::dynamic_pointer_cast<phi::DenseTensor>(ret_et1.impl())
@@ -121,7 +123,7 @@ TEST(AccumulationNode, Tensor) {
       std::make_shared<egr::CppTensorVoidHook>(reduce_hook_1));
 
   // operator()
-  paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0];
+  paddle::experimental::Tensor _ret = node->operator()(et0_vec)[0][0];
 
   // Check operator() result, should be 36.0
   auto* _ret_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(_ret.impl())
diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
index 1683f4ed5fbe5..5fec38bf25a43 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
@@ -17,6 +17,14 @@
 
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT);
+#endif
 
 namespace eager_test {
 using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta;
@@ -82,7 +90,7 @@ TEST(Tensor, MemberFunction) {
   VLOG(6) << "Set impl";
   CHECK_EQ(et3.initialized(), true);
   CHECK_EQ(et3.is_cpu(), true);
-  CHECK_EQ(et3.is_cuda(), false);
+  CHECK_EQ(et3.is_gpu(), false);
   CHECK_EQ(et3.numel(), 2);
   auto expected_dim = phi::make_ddim({1, 2});
   CHECK_EQ(et3.dims(), expected_dim);
@@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) {
   CHECK_EQ(dt3_tmp_ptr[1], 10.0f);
   t4.reset();
   CHECK(t4.defined() == false);
+
+  VLOG(6) << "Check Tensor Copy_";
+  std::vector<int64_t> rows = {1, 2};
+  std::vector<int64_t> dims = {2};
+  paddle::experimental::Tensor t7(std::make_shared<phi::SelectedRows>(rows, 2));
+  std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
+      ->mutable_value()
+      ->Resize(phi::make_ddim(dims));
+  auto* dt7_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t7.impl())
+                          ->mutable_value()
+                          ->mutable_data<float>(paddle::platform::CPUPlace());
+  dt7_tmp_ptr[0] = 6.0f;
+  dt7_tmp_ptr[1] = 11.0f;
+
+  paddle::experimental::Tensor t8;
+  paddle::experimental::Tensor t5;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  paddle::experimental::Tensor t6;
+  paddle::experimental::Tensor t9;
+  VLOG(6) << "Check Tensor Copy_ Selected Rows";
+  t8.copy_(t7, paddle::platform::CUDAPlace(0), true);
+  t9.copy_(t8, paddle::platform::CPUPlace(), true);
+  auto* dt9_tmp_ptr = std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())
+                          ->value()
+                          .data<float>();
+  CHECK_EQ(dt9_tmp_ptr[0], 6.0f);
+  CHECK_EQ(dt9_tmp_ptr[1], 11.0f);
+  CHECK_EQ(std::dynamic_pointer_cast<phi::SelectedRows>(t9.impl())->height(),
+           2);
+
+  VLOG(6) << "Check Tensor Copy_ Dense Tensor";
+  t5.copy_(t3, paddle::platform::CUDAPlace(0), true);
+  t6.copy_(t5, paddle::platform::CPUPlace(), true);
+  auto* dt6_tmp_ptr =
+      std::dynamic_pointer_cast<phi::DenseTensor>(t6.impl())->data<float>();
+  CHECK_EQ(dt6_tmp_ptr[0], 5.0f);
+  CHECK_EQ(dt6_tmp_ptr[1], 10.0f);
+#else
+  t5.copy_(t3, paddle::platform::CPUPlace(), true);
+  auto* dt5_tmp_ptr =
+      std::dynamic_pointer_cast<phi::DenseTensor>(t5.impl())->data<float>();
+  CHECK_EQ(dt5_tmp_ptr[0], 5.0f);
+  CHECK_EQ(dt5_tmp_ptr[1], 10.0f);
+#endif
+
   VLOG(6) << "Finish";
 }
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
index e3db309c4016a..d592b5ccf66ff 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 
@@ -23,14 +24,9 @@
 
 TEST(GradNodeInfo, GradSlotMeta) {
   auto grad_slot = egr::GradSlotMeta();
-  CHECK(grad_slot.IsInitialized() == false);
-  VLOG(6) << "Init GradSlotMeta";
-  grad_slot.Init(2);
-  CHECK(grad_slot.IsInitialized() == true);
   VLOG(6) << "Set SetStopGradient";
-  grad_slot.SetStopGradient(0);
-  CHECK(grad_slot.IsStopGradient(0) == true);
-  CHECK_EQ(grad_slot.Size(), 2);
+  grad_slot.SetStopGradient();
+  CHECK(grad_slot.IsStopGradient() == true);
 }
 
 void TestGradNodeBase(bool is_remove_gradient_hook) {
@@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) {
                ->data<float>()[0],
            6.0f);
   VLOG(6) << "Test Add Edges";
-  egr::Edge edge0(grad_test_node1, 1, 2);
-  auto auto_grad0 = std::make_shared<egr::AutogradMeta>(edge0);
+  egr::Edge tmp_edge0(grad_test_node1, 1, 2);
+  auto auto_grad0 = std::make_shared<egr::AutogradMeta>(tmp_edge0);
   auto_grad0->SetStopGradient(false);
-  egr::Edge edge1(grad_test_node1, 3, 4);
-  auto auto_grad1 = std::make_shared<egr::AutogradMeta>(edge1);
+
+  egr::Edge tmp_edge1(grad_test_node1, 3, 4);
+  auto auto_grad1 = std::make_shared<egr::AutogradMeta>(tmp_edge1);
+  et1.set_autograd_meta(auto_grad1);
   auto_grad1->SetStopGradient(false);
   grad_test_node0->AddEdges(auto_grad0.get(), 0);
+
   CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first,
            size_t(1));
   CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second,
            size_t(2));
   std::vector<egr::AutogradMeta*> metas = {auto_grad1.get()};
+
   grad_test_node0->AddEdges(&metas, 1);
   CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first,
            size_t(3));
@@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) {
 
   VLOG(6) << "Test Set Meta and Get Meta";
   auto_grad1->SetStopGradient(true);
-  grad_test_node0->SetGradInMeta(&metas, 0);
-  grad_test_node0->SetGradInMeta(auto_grad1.get(), 1);
-  grad_test_node0->SetGradOutMeta(&metas, 0);
-  grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1);
-  CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1);
-  CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1);
-  CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0));
-  CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0));
+  grad_test_node0->SetGradInMeta(et1, 0);
+  grad_test_node0->SetGradInMeta({et1}, 1);
+  grad_test_node0->SetGradOutMeta(et1, 0);
+  grad_test_node0->SetGradOutMeta({et1}, 1);
+  CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1));
+  CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1));
+  CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype,
+           meta.dtype);
+  CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype,
+           meta.dtype);
+  CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient());
+  CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient());
+  CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype,
+           meta.dtype);
+  CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype,
+           meta.dtype);
 
   VLOG(6) << "Test Default Set Meta and Get Meta";
   auto grad_test_node2 = std::make_shared<eager_test::GradTestNode>(
       /* val */ 5.0, /* in_num */ 1, /* out_num */ 1);
   grad_test_node2->SetDefaultGradInOutMeta();
-  CHECK(grad_test_node2->OutputMeta()[0].IsInitialized());
-  CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false);
-  CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1);
+  CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0));
+  CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false);
+  CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1));
 
   VLOG(6) << "Test Gradient Hook";
   auto gradient_hook = [](
@@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) {
 }
 
 TEST(GradNodeInfo, Edge) {
+  phi::DenseTensorMeta meta =
+      phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1}));
+  std::shared_ptr<phi::DenseTensor> dt = std::make_shared<phi::DenseTensor>(
+      std::make_unique<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CPUPlace())
+          .get(),
+      meta);
+  paddle::experimental::Tensor et1(dt);
+
   auto grad_test_node0 = std::make_shared<eager_test::GradTestNode>(5, 2, 2);
+  auto auto_grad1 = std::make_shared<egr::AutogradMeta>();
   VLOG(6) << "Test Construct Edge";
   egr::Edge edge0 = egr::Edge();
   CHECK(edge0.IsInitialized() == false);
@@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) {
       egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0)));
   VLOG(6) << "Test Set Edge's Grad Node";
   auto* grad_node = edge1.GetGradNode();
+  et1.set_autograd_meta(auto_grad1);
+  grad_node->SetGradInMeta(et1, 0);
+
   CHECK_EQ(grad_node->InputMeta().size(), size_t(2));
-  auto mt_grad_node = edge1.GetMutableGradNode();
-  auto auto_grad1 = std::make_shared<egr::AutogradMeta>();
   std::vector<egr::AutogradMeta*> metas = {auto_grad1.get()};
-  // Uninitialized AutogradMeta indicates
-  mt_grad_node->SetGradInMeta(&metas, 0);
-  CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true);
+  CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true);
   VLOG(6) << "Test Get/Set Edge Rank Info";
   CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1));
   CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0));
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
index 535c93ac53b17..dff12fdfc34a1 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h
@@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase {
   GradTestNode() : GradNodeBase() { val_ = 1.0; }
   std::string name() override { return "GradTestNode"; }
   std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
-      override {
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      bool create_graph = false) override {
     val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
                ->data<float>()[0];
     phi::DenseTensorMeta meta =
@@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase {
     std::vector<std::vector<paddle::experimental::Tensor>> res = {{et1}};
     return res;
   }
+  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+  bool IsTensorWrappersCleared() override {
+    VLOG(6) << "Do nothing here now";
+    return false;
+  }
   float val_;
 };
 }  // namespace eager_test
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index 384fdcd6f97c4..645eac06ddda5 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -30,8 +30,7 @@ PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
 using namespace egr;  // NOLINT
 
 TEST(GradTensorHolder, Constructor) {
-  GradSlotMeta slot_meta;
-  slot_meta.Init(1);
+  std::vector<GradSlotMeta> slot_meta(1);
   GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta});
   GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder);
 
@@ -72,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) {
   paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1);
 
   // Constructor empty GradTensorHolder
-  GradSlotMeta slot_meta;
-  slot_meta.Init(1);
+  std::vector<GradSlotMeta> slot_meta(1);
   GradTensorHolder grad_tensor_holder =
       GradTensorHolder({slot_meta, slot_meta});
 
@@ -138,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) {
   paddle::experimental::Tensor t2(sr2);
 
   // Constructor empty GradTensorHolder
-  GradSlotMeta slot_meta;
-  slot_meta.Init(1);
+  std::vector<GradSlotMeta> slot_meta(1);
   GradTensorHolder grad_tensor_holder =
       GradTensorHolder({slot_meta, slot_meta});
 
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
index 769bd7f687f45..c8fb6050e9d45 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc
@@ -37,7 +37,7 @@
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/memory/memcpy.h"
 
-static size_t max_num_benchmark_runs = 5000;
+static size_t max_num_benchmark_runs = 4000;
 
 namespace egr {
 
@@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
   }
 
   std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   if (accuracy_check) {
     // Examine Forward Grad (w.r.t max_num_runs = 10)
@@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
   }
 
   std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   if (accuracy_check) {
     // Examine Forward Grad (w.r.t max_num_runs = 2)
@@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
   }
 
   std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   if (accuracy_check) {
     // Examine Forward Grad (w.r.t max_num_runs = 2)
@@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp(
       reduce_sum_dygraph_function(input0, {{"reduce_all", true}});
 
   std::vector<paddle::experimental::Tensor> target_tensors = {Out};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   if (accuracy_check) {
     std::unordered_map<std::string, float> result =
diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
index c65ad4641cf22..52dba6b9218c7 100644
--- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
+++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt
@@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_
 cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
 cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
 cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
+cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
 
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc
index 0c894ed267fcd..87f8f6eca1f88 100644
--- a/paddle/fluid/eager/tests/task_tests/backward_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc
@@ -33,6 +33,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
 
 namespace egr {
 
@@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) {
   }
   std::vector<paddle::experimental::Tensor> outs = {target_tensor};
   // Run Backward
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Check Output Value
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
@@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) {
   }
 
   // Run Backward
-  RunBackward(target_tensors, grad_tensors);
+  Backward(target_tensors, grad_tensors);
 
   // Check Output Value
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
@@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) {
   }
 
   // Use Empty Grad Tensor
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   // Check Output Value
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
@@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) {
     node2_ptr->AddEdges(&res2, 0);
   }
 
-  RunBackward(target_tensors, grad_tensors);
+  Backward(target_tensors, grad_tensors);
 
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 2500.0);
 }
diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
index 36594f1aac8cd..8b0759c17ed37 100644
--- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc
@@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
   std::vector<egr::AutogradMeta*> res = {meta};
   scale_node_ptr->AddEdges(&res, 0);
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 10.0);
diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
index 217055e4e9e4a..7486e711641fc 100644
--- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
@@ -247,4 +247,20 @@ TEST(EagerUtils, GetGradAccumulationNode) {
   ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0));
 }
 
+TEST(EagerUtils, FillZeroForEmptyGradInputs) {
+  std::vector<std::vector<paddle::experimental::Tensor>> grads = {
+      std::vector<paddle::experimental::Tensor>(1)};
+  std::vector<std::vector<GradSlotMeta>> slot_metas = {
+      std::vector<GradSlotMeta>(1)};
+
+  phi::DenseTensorMeta tensor_meta;
+  tensor_meta.dtype = paddle::experimental::DataType::FLOAT32;
+  tensor_meta.dims = {2, 4};
+  slot_metas[0][0].SetTensorMeta(tensor_meta);
+  slot_metas[0][0].SetPlace(phi::CPUPlace());
+
+  EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas);
+  eager_test::CompareTensorWithValue<float>(grads[0][0], 0.0);
+}
+
 }  // namespace egr
diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
index f7fa642ea8dd1..882695e98d109 100644
--- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc
@@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) {
 
   std::vector<paddle::experimental::Tensor> outs = {out};
   // 4. Run Backward
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   VLOG(7) << "Target Grad is: "
           << std::static_pointer_cast<phi::DenseTensor>(
@@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) {
 
   std::vector<paddle::experimental::Tensor> outs = {out1};
   // 4. Run Backward
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 10.0);
@@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
 
   // 4. Run Backward
   std::vector<paddle::experimental::Tensor> outs = {out1, out2};
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
@@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) {
 
   // 4. Run Backward
   std::vector<paddle::experimental::Tensor> outs = {out1, out2};
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   // leaf grad
@@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) {
 
   // 4. Run Backward
   std::vector<paddle::experimental::Tensor> outs = {out1, out2};
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
 
   // Cross Batch Accumulation
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 60.0);
@@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) {
 
   std::vector<paddle::experimental::Tensor> outs = {out};
   // 4. Run Backward
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 2.0);
@@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) {
   // TODO(jiabin): fix this with add functor
   // 4. Run Backward
   std::vector<paddle::experimental::Tensor> outs = {out1, out2};
-  RunBackward(outs, {});
+  Backward(outs, {});
 
   // Examine Backward Grad
   eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc
index 2a5ad53204a62..3c237b76e64b0 100644
--- a/paddle/fluid/eager/tests/task_tests/generated_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -35,6 +35,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
 
@@ -57,7 +59,7 @@ TEST(Generated, Sigmoid) {
 
   std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
   VLOG(6) << "Runing Backward";
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   VLOG(6) << "Finish Backward";
   eager_test::CompareGradTensorWithValue<float>(tensor, 0.25);
@@ -89,7 +91,7 @@ TEST(Generated, Matmul_v2) {
   eager_test::CompareTensorWithValue<float>(output_tensor, 96);
 
   std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
   eager_test::CompareGradTensorWithValue<float>(Y, 3.0 * 4);
@@ -120,7 +122,7 @@ TEST(Generated, ElementwiseAdd) {
   eager_test::CompareTensorWithValue<float>(output_tensor, 5);
 
   std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(X, 1.0);
   eager_test::CompareGradTensorWithValue<float>(Y, 1.0);
@@ -128,6 +130,6 @@ TEST(Generated, ElementwiseAdd) {
 
 }  // namespace egr
 
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc
new file mode 100644
index 0000000000000..6b03799c48659
--- /dev/null
+++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc
@@ -0,0 +1,339 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/eager/accumulation/accumulation_node.h"
+#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/backward.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/tests/test_utils.h"
+
+#include "paddle/fluid/eager/api/all.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
+namespace egr {
+
+TEST(Grad, SingleNodeEmptyGrad) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  // Prepare Inputs
+  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
+
+  // Create Target Tensor (output)
+  paddle::experimental::Tensor output_tensor =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
+
+  // Create input tensor
+  const paddle::experimental::Tensor leaf_tensor =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
+
+  {
+    // Create Scale Node
+    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node0_ptr->SetAttributes_scale(5.0 /*scale*/);
+
+    // Set grad in/out meta
+    node0_ptr->SetDefaultGradInOutMeta();
+
+    // Output_tensor set GradNode、OutRank、StopGradient propertis
+    AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor);
+    auto_grad_meta->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
+    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta->SetStopGradient(false);
+
+    // Get autograd_meta from input tensor
+    AutogradMeta* auto_grad_meta1 =
+        EagerUtils::unsafe_autograd_meta(leaf_tensor);
+
+    // Connect Tensor and AccumulationNode via AutoGradMeta
+    auto acc_node_ptr =
+        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
+
+    // input tensor set GradNode、OutRank、StopGradient propertis
+    auto_grad_meta1->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
+    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta1->SetStopGradient(false);
+
+    // grad_node Add Edges
+    std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
+    node0_ptr->AddEdges(&res, 0);
+  }
+  std::vector<paddle::experimental::Tensor> outs = {output_tensor};
+
+  // Run Grad
+  auto result = Grad(outs, {leaf_tensor}, {});
+  // Check Output Value
+  eager_test::CompareTensorWithValue<float>(result[0], 5.0);
+}
+
+TEST(Grad, SingleNodeCustomGrad) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  // Prepare Inputs
+  std::vector<paddle::experimental::Tensor> target_tensors;
+  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
+
+  // Create Target Tensor
+  paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
+  target_tensors.emplace_back(std::move(tensor));
+
+  std::vector<paddle::experimental::Tensor> grad_tensors;
+  // Create Grad Tensor
+  paddle::experimental::Tensor grad_tensor =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
+  grad_tensors.emplace_back(std::move(grad_tensor));
+
+  paddle::experimental::Tensor leaf_tensor =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
+
+  {
+    // Create Scale Node
+    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node0_ptr->SetAttributes_scale(5.0 /*scale*/);
+
+    // Set grad in/out meta
+    node0_ptr->SetDefaultGradInOutMeta();
+
+    // Connect Tensor and Node via AutoGradMeta
+    AutogradMeta* auto_grad_meta =
+        EagerUtils::autograd_meta(&(target_tensors[0]));
+    auto_grad_meta->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
+    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta->SetStopGradient(false);
+
+    AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
+    // Connect Tensor and AccumulationNode via AutoGradMeta
+    auto acc_node_ptr =
+        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
+
+    auto_grad_meta1->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
+    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta1->SetStopGradient(false);
+    std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
+    node0_ptr->AddEdges(&res, 0);
+  }
+
+  auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
+
+  // Check Output Value
+  eager_test::CompareTensorWithValue<float>(result[0], 50.0);
+}
+
+/*
+Node1
+  |
+Node0
+  |
+ { } // empty grad tensor
+*/
+TEST(Grad, LinearNodes) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  // Prepare Target Tensor
+  std::vector<paddle::experimental::Tensor> target_tensors;
+  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
+
+  // Create Target Tensor
+  paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
+  target_tensors.emplace_back(std::move(tensor));
+
+  paddle::experimental::Tensor leaf_tensor =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
+  {
+    // Create Node0
+    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node0_ptr->SetAttributes_scale(5.0 /*scale*/);
+
+    // Set grad in/out meta for node0
+    node0_ptr->SetDefaultGradInOutMeta();
+
+    // Create Node1
+    auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node1_ptr->SetAttributes_scale(10.0 /*scale*/);
+
+    // Set grad in/out meta for node1
+    node1_ptr->SetDefaultGradInOutMeta();
+
+    // Connect Input Tensor and Node0 via AutoGradMeta
+    AutogradMeta* auto_grad_meta =
+        EagerUtils::autograd_meta(&(target_tensors[0]));
+    auto_grad_meta->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
+    auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta->SetStopGradient(false);
+    // Connect Node0 -> Node1 via Edge
+    auto meta0 = egr::AutogradMeta();
+    meta0.SetStopGradient(false);
+    meta0.SetSingleOutRankWithSlot(0, 0);
+    meta0.SetGradNode(node1_ptr);
+    std::vector<egr::AutogradMeta*> res0 = {&meta0};
+    node0_ptr->AddEdges(&res0, 0);
+
+    AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
+    // Connect Tensor and AccumulationNode via AutoGradMeta
+    auto acc_node_ptr =
+        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
+
+    auto_grad_meta1->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
+    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
+
+    auto_grad_meta1->SetStopGradient(false);
+    std::vector<egr::AutogradMeta*> res1 = {auto_grad_meta1};
+    node1_ptr->AddEdges(&res1, 0);
+  }
+
+  // Use Empty Grad Tensor
+  auto result = Grad(target_tensors, {leaf_tensor}, {});
+
+  // Check Output Value
+  eager_test::CompareTensorWithValue<float>(result[0], 50.0);
+}
+
+/*
+    Node2
+    |   |
+Node0   Node1
+  |      |
+ in0   in1
+*/
+TEST(Grad, WithAccumulation) {
+  // Prepare Device Contexts
+  eager_test::InitEnv(paddle::platform::CPUPlace());
+
+  // Prepare Inputs
+  paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
+
+  // Create Target Tensor
+  std::vector<paddle::experimental::Tensor> target_tensors;
+  paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
+  paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
+      ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+      phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
+  target_tensors.emplace_back(std::move(tensor0));
+  target_tensors.emplace_back(std::move(tensor1));
+
+  // Create Grad Tensor
+  std::vector<paddle::experimental::Tensor> grad_tensors;
+  paddle::experimental::Tensor grad_tensor0 =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
+  paddle::experimental::Tensor grad_tensor1 =
+      egr_utils_api::CreateTensorWithValue(
+          ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
+          phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
+  grad_tensors.emplace_back(std::move(grad_tensor0));
+  grad_tensors.emplace_back(std::move(grad_tensor1));
+
+  paddle::experimental::Tensor leaf_tensor;
+  {
+    // Create Node0
+    auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node0_ptr->SetAttributes_scale(5.0 /*scale*/);
+    node0_ptr->SetDefaultGradInOutMeta();
+
+    // Create Node1
+    auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node1_ptr->SetAttributes_scale(10.0 /*scale*/);
+    node1_ptr->SetDefaultGradInOutMeta();
+    // Create Node2
+    auto node2_ptr = std::make_shared<GradNodeScale>(1, 1);
+    node2_ptr->SetAttributes_scale(20.0 /*scale*/);
+    node2_ptr->SetDefaultGradInOutMeta();
+    // Connect Inp0 and Node0 via AutoGradMeta
+    AutogradMeta* auto_grad_meta0 =
+        EagerUtils::autograd_meta(&(target_tensors[0]));
+    auto_grad_meta0->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
+    auto_grad_meta0->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta0->SetStopGradient(false);
+    // Connect Inp1 and Node1 via AutoGradMeta
+    AutogradMeta* auto_grad_meta1 =
+        EagerUtils::autograd_meta(&(target_tensors[1]));
+    auto_grad_meta1->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(node1_ptr));
+    auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
+    auto_grad_meta1->SetStopGradient(false);
+
+    // Connect Node0 -> Node2 via Edge
+    auto meta0 = egr::AutogradMeta();
+    meta0.SetStopGradient(false);
+    meta0.SetSingleOutRankWithSlot(0, 0);
+    meta0.SetGradNode(node2_ptr);
+    std::vector<egr::AutogradMeta*> res0 = {&meta0};
+    node0_ptr->AddEdges(&res0, 0);
+
+    // Connect Node1 -> Node2 via Edge
+    auto meta1 = egr::AutogradMeta();
+    meta1.SetStopGradient(false);
+    meta1.SetSingleOutRankWithSlot(0, 0);
+    meta1.SetGradNode(node2_ptr);
+    std::vector<egr::AutogradMeta*> res1 = {&meta1};
+    node1_ptr->AddEdges(&res1, 0);
+
+    AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);
+    // Connect Tensor and AccumulationNode via AutoGradMeta
+    auto acc_node_ptr =
+        std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta2);
+
+    auto_grad_meta2->SetGradNode(
+        std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
+    auto_grad_meta2->SetSingleOutRankWithSlot(0, 0);
+
+    auto_grad_meta2->SetStopGradient(false);
+    std::vector<egr::AutogradMeta*> res2 = {auto_grad_meta2};
+    node2_ptr->AddEdges(&res2, 0);
+  }
+
+  auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
+
+  eager_test::CompareTensorWithValue<float>(result[0], 2500.0);
+}
+
+}  // namespace egr
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc
index d546df4ed087a..2c53fc89f650e 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc
@@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
         leaf_tensor);  // result: 4.0*5.0 + 3.0 = 23.0
   }
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(target_tensor, 4.0);
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
@@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) {
         leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
   }
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
   eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
   eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
 }
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
index 56813c498d241..8524be7800bfd 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
@@ -32,6 +32,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
 
@@ -108,7 +110,7 @@ void test_sigmoid(bool is_remove_gradient_hook) {
   }
 
   VLOG(6) << "Runing Backward";
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
   VLOG(6) << "Finish Backward";
 
   eager_test::CompareGradTensorWithValue<float>(
@@ -166,7 +168,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) {
     grad_node_tmp->RemoveGradientHook(hook_id);
   }
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(X, 1.0);
   eager_test::CompareGradTensorWithValue<float>(
@@ -224,7 +226,7 @@ void test_matmul(bool is_remove_gradient_hook) {
     grad_node_tmp->RemoveGradientHook(hook_id);
   }
 
-  RunBackward(target_tensors, {});
+  Backward(target_tensors, {});
 
   eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
   eager_test::CompareGradTensorWithValue<float>(
@@ -255,6 +257,6 @@ TEST(Hook_intermidiate, Matmul_v2) {
 }
 }  // namespace egr
 
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h
index 9967d8c36900f..277319bc700b6 100644
--- a/paddle/fluid/eager/to_static/run_program_op_func.h
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -66,10 +66,10 @@ inline void run_program_dygraph_function(
     grad_node->SetStepScope(step_scope);
 
     // Set Grad out rank as same as fwd input and set stop gradient to bwd
-    grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
-    grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
+    grad_node->SetGradOutMeta(x, /*slot id*/ 0);
+    grad_node->SetGradOutMeta(params, /*slot id*/ 1);
 
-    grad_node->SetGradInMeta(&p_autograd_outs, 0);
+    grad_node->SetGradInMeta(deref_out, 0);
     // Set Next Edges
     grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0);
     grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1);
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index d99624e493248..c83e16e9a1ec2 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
   ~GradNodeRunProgram() override = default;
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
-      override {
+      std::vector<std::vector<paddle::experimental::Tensor>> &grads,  // NOLINT
+      bool create_graph) override {
     VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
     PADDLE_ENFORCE_EQ(
         grads.size(), 1,
@@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase {
     // return {x_grad, details::DereferenceTensors(params_grad_ptr)};
   }
 
+  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+  bool IsTensorWrappersCleared() override {
+    VLOG(6) << "Do nothing here now";
+    return false;
+  }
+
   // SetAttrMap
   void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
     attrs_ = attrs;
diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
index 8a57d2694535e..20faae95281db 100644
--- a/paddle/fluid/eager/utils.cc
+++ b/paddle/fluid/eager/utils.cc
@@ -20,6 +20,7 @@
 
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/tensor_meta.h"
 
 #include "paddle/fluid/framework/data_layout.h"
@@ -212,6 +213,27 @@ std::vector<std::shared_ptr<EagerVariable>> EagerUtils::CreateVars(
   return res;
 }
 
+void EagerUtils::ModifyInplaceInput(
+    const std::shared_ptr<EagerVariable>& inplace_variable,
+    paddle::experimental::Tensor* inplace_tensor) {
+  // Only modify the meta information of the inplace tensor, because
+  // EagerVariable cannot modify Tensor's meta information after inplace
+  // op (such as ``reshape``) is executed.
+  PADDLE_ENFORCE_NOT_NULL(inplace_tensor,
+                          paddle::platform::errors::Fatal(
+                              "Inplace Tensor is null and cannot be modified. "
+                              "We are tring to Modify Inplace Input from its "
+                              "shared_ptr, this error may indicate the inplace "
+                              " input is nullptr"));
+  if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) {
+    phi::DenseTensor* variable_dense_tensor =
+        static_cast<phi::DenseTensor*>(inplace_variable->GetTensorBase().get());
+    phi::DenseTensor* tensor_dense_tensor =
+        static_cast<phi::DenseTensor*>(inplace_tensor->impl().get());
+    tensor_dense_tensor->set_meta(variable_dense_tensor->meta());
+  }
+}
+
 std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs(
     const std::vector<std::shared_ptr<EagerVariable>>& outs) {
   std::vector<paddle::experimental::Tensor> res;
@@ -371,4 +393,28 @@ std::shared_ptr<egr::GradNodeBase> EagerUtils::GetGradAccumulationNode(
   }
 }
 
+void EagerUtils::FillZeroForEmptyGradInputs(
+    std::vector<std::vector<paddle::experimental::Tensor>>* in_grads,
+    const std::vector<std::vector<GradSlotMeta>>& grad_in_metas) {
+  for (size_t i = 0; i < in_grads->size(); i++) {
+    for (size_t j = 0; j < (*in_grads)[0].size(); j++) {
+      paddle::experimental::Tensor& grad = (*in_grads)[i][j];
+      if (!grad.is_initialized()) {
+        const GradSlotMeta& grad_in_meta = grad_in_metas[i][j];
+        PADDLE_ENFORCE(
+            grad_in_meta.HasTensorMeta(),
+            paddle::platform::errors::Fatal(
+                "Unable to fill empty grad inputs due to empty GradSlotMeta"));
+
+        const auto& tensor_meta = grad_in_meta.GetTensorMeta();
+        phi::Place place = grad_in_meta.GetPlace();
+
+        auto tensor_with_zero = paddle::experimental::full(
+            phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, place);
+        grad.set_impl(tensor_with_zero.impl());
+      }
+    }
+  }
+}
+
 }  // namespace egr
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index fa5735e6f32a0..396837f101c65 100644
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/fluid/eager/api/utils/tensor_utils.h"
 #include "paddle/fluid/eager/autograd_meta.h"
 #include "paddle/fluid/eager/eager_tensor.h"
 #include "paddle/fluid/eager/grad_node_info.h"
@@ -144,6 +145,19 @@ class EagerUtils {
     iter.apply(std::forward<Args>(args)...);
   }
 
+  static void CheckInplace(const paddle::experimental::Tensor& target,
+                           const AutogradMeta* autograd_meta,
+                           bool require_any_grad) {
+    if (require_any_grad && autograd_meta) {
+      PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() &&
+                            egr::egr_utils_api::IsLeafTensor(target),
+                        false, paddle::platform::errors::InvalidArgument(
+                                   "Leaf Var (%s) that doesn't stop gradient "
+                                   "can't use inplace strategy.",
+                                   target.name()));
+    }
+  }
+
   // TensorWrapper Utils
   static paddle::experimental::Tensor RecoverTensorWrapper(
       TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node);
@@ -171,6 +185,9 @@ class EagerUtils {
   static std::vector<std::shared_ptr<EagerVariable>> CreateVars(
       const size_t num);
   // Construct Tensor From var
+  static void ModifyInplaceInput(
+      const std::shared_ptr<EagerVariable>& inplace_variable,
+      paddle::experimental::Tensor* inplace_tensor);
   static std::vector<paddle::experimental::Tensor> GetOutputs(
       const std::vector<std::shared_ptr<EagerVariable>>& outs);
   static paddle::experimental::Tensor GetOutput(
@@ -200,6 +217,13 @@ class EagerUtils {
       const std::vector<paddle::experimental::Tensor>& tensors);
   static std::shared_ptr<egr::GradNodeBase> GetGradAccumulationNode(
       const paddle::experimental::Tensor& tensor);
+
+  /**
+    * Fill Zero
+    * **/
+  static void FillZeroForEmptyGradInputs(
+      std::vector<std::vector<paddle::experimental::Tensor>>* out_grads,
+      const std::vector<std::vector<GradSlotMeta>>& grad_out_metas);
 };
 
 }  // namespace egr
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 1a4f283f511da..589d09bf81c1d 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
     return;
   }
 
+  // NOTE(hqp): Special case for CPU->MLU, avoid stream sync.
+  if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) {
+    paddle::framework::TensorCopy(
+        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
+        out);
+    return;
+  }
+
   // NOTE(yy): TransDataDevice should wait for computation of input.
   if (!platform::is_cuda_pinned_place(in.place())) {
     platform::DeviceContextPool::Instance().Get(in.place())->Wait();
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 48850d4624a14..f951b5d0f5070 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool force_disable_gc, bool keep_kid_scopes) {
   platform::RecordBlock b(block_id);
   if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
+  platform::RegisterModelLayout(ctx->ops_, place_);
 #endif
-  auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
   RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars,
                      keep_kid_scopes);
 }
diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index 17346f5fd9393..2b8b4b3ff9573 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -10,8 +10,9 @@ IF(WITH_GPU)
     nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
     nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
     nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
-    nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm)
+    nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
     nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
+    nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps)
 ENDIF()
 IF(WITH_ROCM)
     hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
new file mode 100644
index 0000000000000..235f7a226ad17
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef PADDLE_WITH_HETERPS
+namespace paddle {
+namespace framework {
+struct GpuPsGraphNode {
+  int64_t node_id;
+  int neighbor_size, neighbor_offset;
+  // this node's neighbor is stored on [neighbor_offset,neighbor_offset +
+  // neighbor_size) of int64_t *neighbor_list;
+};
+
+struct GpuPsCommGraph {
+  int64_t *neighbor_list;
+  GpuPsGraphNode *node_list;
+  int neighbor_size, node_size;
+  // the size of neighbor array and graph_node_list array
+  GpuPsCommGraph()
+      : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
+  GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_,
+                 int neighbor_size_, int node_size_)
+      : neighbor_list(neighbor_list_),
+        node_list(node_list_),
+        neighbor_size(neighbor_size_),
+        node_size(node_size_) {}
+};
+
+/*
+suppose we have a graph like this
+
+0----3-----5----7
+ \   |\         |\
+ 17  8 9        1 2
+
+we save the nodes in arbitrary order,
+in this example,the order is
+[0,5,1,2,7,3,8,9,17]
+let us name this array u_id;
+we record each node's neighbors:
+0:3,17
+5:3,7
+1:7
+2:7
+7:1,2,5
+3:0,5,8,9
+8:3
+9:3
+17:0
+
+by concatenating each node's neighbor_list in the order we save the node id.
+we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
+this is the neighbor_list of GpuPsCommGraph
+given this neighbor_list and the order to save node id,
+we know,
+node 0's neighbors are in the range [0,1] of neighbor_list
+node 5's neighbors are in the range [2,3] of neighbor_list
+node 1's neighbors are in the range [4,4] of neighbor_list
+node 2:[5,5]
+node 7:[6,6]
+node 3:[9,12]
+node 8:[13,13]
+node 9:[14,14]
+node 17:[15,15]
+...
+by the above information,
+we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
+of size 9,
+where node_list[i].id = u_id[i]
+then we have:
+node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
+node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
+node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
+node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
+node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
+node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
+node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
+node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
+node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
+*/
+struct NeighborSampleResult {
+  int64_t *val;
+  int *actual_sample_size, sample_size, key_size;
+  NeighborSampleResult(int _sample_size, int _key_size)
+      : sample_size(_sample_size), key_size(_key_size) {
+    actual_sample_size = NULL;
+    val = NULL;
+  };
+  ~NeighborSampleResult() {
+    if (val != NULL) cudaFree(val);
+    if (actual_sample_size != NULL) cudaFree(actual_sample_size);
+  }
+};
+
+struct NodeQueryResult {
+  int64_t *val;
+  int actual_sample_size;
+  NodeQueryResult() {
+    val = NULL;
+    actual_sample_size = 0;
+  };
+  ~NodeQueryResult() {
+    if (val != NULL) cudaFree(val);
+  }
+};
+}
+};
+#endif
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
index a6508bf96c00f..3d1599a76e8eb 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h
@@ -13,115 +13,27 @@
 // limitations under the License.
 
 #pragma once
+#include <thrust/host_vector.h>
 #include "heter_comm.h"
+#include "paddle/fluid/distributed/ps/table/common_graph_table.h"
+#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
 #include "paddle/fluid/platform/enforce.h"
 #ifdef PADDLE_WITH_HETERPS
 namespace paddle {
 namespace framework {
-struct GpuPsGraphNode {
-  int64_t node_id;
-  int neighbor_size, neighbor_offset;
-  // this node's neighbor is stored on [neighbor_offset,neighbor_offset +
-  // neighbor_size) of int64_t *neighbor_list;
-};
-
-struct GpuPsCommGraph {
-  int64_t *neighbor_list;
-  GpuPsGraphNode *node_list;
-  int neighbor_size, node_size;
-  // the size of neighbor array and graph_node_list array
-  GpuPsCommGraph()
-      : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {}
-  GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_,
-                 int neighbor_size_, int node_size_)
-      : neighbor_list(neighbor_list_),
-        node_list(node_list_),
-        neighbor_size(neighbor_size_),
-        node_size(node_size_) {}
-};
-
-/*
-suppose we have a graph like this
 
-0----3-----5----7
- \   |\         |\
- 17  8 9        1 2
-
-we save the nodes in arbitrary order,
-in this example,the order is
-[0,5,1,2,7,3,8,9,17]
-let us name this array u_id;
-we record each node's neighbors:
-0:3,17
-5:3,7
-1:7
-2:7
-7:1,2,5
-3:0,5,8,9
-8:3
-9:3
-17:0
-
-by concatenating each node's neighbor_list in the order we save the node id.
-we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0]
-this is the neighbor_list of GpuPsCommGraph
-given this neighbor_list and the order to save node id,
-we know,
-node 0's neighbors are in the range [0,1] of neighbor_list
-node 5's neighbors are in the range [2,3] of neighbor_list
-node 1's neighbors are in the range [4,4] of neighbor_list
-node 2:[5,5]
-node 7:[6,6]
-node 3:[9,12]
-node 8:[13,13]
-node 9:[14,14]
-node 17:[15,15]
-...
-by the above information,
-we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph
-of size 9,
-where node_list[i].id = u_id[i]
-then we have:
-node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0
-node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2
-node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4
-node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5
-node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6
-node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9
-node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13
-node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14
-node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15
-*/
-struct NeighborSampleResult {
-  int64_t *val;
-  int *actual_sample_size, sample_size, key_size;
-  NeighborSampleResult(int _sample_size, int _key_size)
-      : sample_size(_sample_size), key_size(_key_size) {
-    actual_sample_size = NULL;
-    val = NULL;
-  };
-  ~NeighborSampleResult() {
-    if (val != NULL) cudaFree(val);
-    if (actual_sample_size != NULL) cudaFree(actual_sample_size);
-  }
-};
-
-struct NodeQueryResult {
-  int64_t *val;
-  int actual_sample_size;
-  NodeQueryResult() {
-    val = NULL;
-    actual_sample_size = 0;
-  };
-  ~NodeQueryResult() {
-    if (val != NULL) cudaFree(val);
-  }
-};
 class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
  public:
   GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource)
       : HeterComm<int64_t, int, int>(1, resource) {
     load_factor_ = 0.25;
+    rw_lock.reset(new pthread_rwlock_t());
+    cpu_table_status = -1;
+  }
+  ~GpuPsGraphTable() {
+    if (cpu_table_status != -1) {
+      end_graph_sampling();
+    }
   }
   void build_graph_from_cpu(std::vector<GpuPsCommGraph> &cpu_node_list);
   NodeQueryResult *graph_node_sample(int gpu_id, int sample_size);
@@ -129,14 +41,26 @@ class GpuPsGraphTable : public HeterComm<int64_t, int, int> {
                                               int sample_size, int len);
   NodeQueryResult *query_node_list(int gpu_id, int start, int query_size);
   void clear_graph_info();
-  void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num,
-                                                 int sample_size, int *h_left,
-                                                 int *h_right,
-                                                 int64_t *src_sample_res,
-                                                 int *actual_sample_size);
+  void move_neighbor_sample_result_to_source_gpu(
+      int gpu_id, int gpu_num, int *h_left, int *h_right,
+      int64_t *src_sample_res, thrust::host_vector<int> &total_sample_size);
+  void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num,
+                                               int *h_left, int *h_right,
+                                               int *actual_sample_size,
+                                               int *total_sample_size);
+  int init_cpu_table(const paddle::distributed::GraphParameter &graph);
+  int load(const std::string &path, const std::string &param);
+  virtual int32_t end_graph_sampling() {
+    return cpu_graph_table->end_graph_sampling();
+  }
 
  private:
   std::vector<GpuPsCommGraph> gpu_graph_list;
+  std::shared_ptr<paddle::distributed::GraphTable> cpu_graph_table;
+  std::shared_ptr<pthread_rwlock_t> rw_lock;
+  mutable std::mutex mutex_;
+  std::condition_variable cv_;
+  int cpu_table_status;
 };
 }
 };
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
index 839c7e5468c6c..acd3f0a290d0b 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h
@@ -13,9 +13,23 @@
 // limitations under the License.
 
 #pragma once
+
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
 #ifdef PADDLE_WITH_HETERPS
+//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 namespace paddle {
 namespace framework {
+
+constexpr int WARP_SIZE = 32;
+
 /*
 comment 0
 this kernel just serves as an example of how to sample nodes' neighbors.
@@ -28,30 +42,116 @@ sample_size;
 
 */
 
-__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
-                                        int* actual_size,
-                                        int64_t* sample_result, int sample_size,
-                                        int len) {
-  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < len) {
+struct MaxFunctor {
+  int sample_size;
+  HOSTDEVICE explicit inline MaxFunctor(int sample_size) {
+    this->sample_size = sample_size;
+  }
+  HOSTDEVICE inline int operator()(int x) const {
+    if (x > sample_size) {
+      return sample_size;
+    }
+    return x;
+  }
+};
+
+struct DegreeFunctor {
+  GpuPsCommGraph graph;
+  HOSTDEVICE explicit inline DegreeFunctor(GpuPsCommGraph graph) {
+    this->graph = graph;
+  }
+  HOSTDEVICE inline int operator()(int i) const {
+    return graph.node_list[i].neighbor_size;
+  }
+};
+
+template <int BLOCK_WARPS, int TILE_SIZE>
+__global__ void neighbor_sample(const uint64_t rand_seed, GpuPsCommGraph graph,
+                                int sample_size, int* index, int len,
+                                int64_t* sample_result, int* output_idx,
+                                int* output_offset) {
+  assert(blockDim.x == WARP_SIZE);
+  assert(blockDim.y == BLOCK_WARPS);
+
+  int i = blockIdx.x * TILE_SIZE + threadIdx.y;
+  const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len);
+  curandState rng;
+  curand_init(rand_seed * gridDim.x + blockIdx.x,
+              threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng);
+
+  while (i < last_idx) {
     auto node_index = index[i];
-    actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size
-                         ? graph.node_list[node_index].neighbor_size
-                         : sample_size;
-    int offset = graph.node_list[node_index].neighbor_offset;
-    for (int j = 0; j < actual_size[i]; j++) {
-      sample_result[sample_size * i + j] = graph.neighbor_list[offset + j];
+    int degree = graph.node_list[node_index].neighbor_size;
+    const int offset = graph.node_list[node_index].neighbor_offset;
+    int output_start = output_offset[i];
+
+    if (degree <= sample_size) {
+      // Just copy
+      for (int j = threadIdx.x; j < degree; j += WARP_SIZE) {
+        sample_result[output_start + j] = graph.neighbor_list[offset + j];
+      }
+    } else {
+      for (int j = threadIdx.x; j < degree; j += WARP_SIZE) {
+        output_idx[output_start + j] = j;
+      }
+
+      __syncwarp();
+
+      for (int j = sample_size + threadIdx.x; j < degree; j += WARP_SIZE) {
+        const int num = curand(&rng) % (j + 1);
+        if (num < sample_size) {
+          atomicMax(
+              reinterpret_cast<unsigned int*>(output_idx + output_start + num),
+              static_cast<unsigned int>(j));
+        }
+      }
+
+      __syncwarp();
+
+      for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) {
+        const int perm_idx = output_idx[output_start + j] + offset;
+        sample_result[output_start + j] = graph.neighbor_list[perm_idx];
+      }
     }
+
+    i += BLOCK_WARPS;
   }
 }
 
+int GpuPsGraphTable::init_cpu_table(
+    const paddle::distributed::GraphParameter& graph) {
+  cpu_graph_table.reset(new paddle::distributed::GraphTable);
+  cpu_table_status = cpu_graph_table->initialize(graph);
+  if (cpu_table_status != 0) return cpu_table_status;
+  std::function<void(std::vector<GpuPsCommGraph>&)> callback =
+      [this](std::vector<GpuPsCommGraph>& res) {
+        pthread_rwlock_wrlock(this->rw_lock.get());
+        this->clear_graph_info();
+        this->build_graph_from_cpu(res);
+        pthread_rwlock_unlock(this->rw_lock.get());
+        cv_.notify_one();
+      };
+  cpu_graph_table->set_graph_sample_callback(callback);
+  return cpu_table_status;
+}
+
+int GpuPsGraphTable::load(const std::string& path, const std::string& param) {
+  int status = cpu_graph_table->load(path, param);
+  if (status != 0) {
+    return status;
+  }
+  std::unique_lock<std::mutex> lock(mutex_);
+  cpu_graph_table->start_graph_sampling();
+  cv_.wait(lock);
+  return 0;
+}
 /*
  comment 1
 
  gpu i triggers a neighbor_sample task,
  when this task is done,
  this function is called to move the sample result on other gpu back
- to gup i and aggragate the result.
+ to gpu i and aggragate the result.
  the sample_result is saved on src_sample_res and the actual sample size for
  each node is saved on actual_sample_size.
  the number of actual sample_result for
@@ -68,9 +168,50 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index,
  that's what fill_dvals does.
 
 */
+void GpuPsGraphTable::move_neighbor_sample_size_to_source_gpu(
+    int gpu_id, int gpu_num, int* h_left, int* h_right, int* actual_sample_size,
+    int* total_sample_size) {
+  // This function copyed actual_sample_size to source_gpu,
+  // and calculate total_sample_size of each gpu sample number.
+  for (int i = 0; i < gpu_num; i++) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      continue;
+    }
+    auto shard_len = h_right[i] - h_left[i] + 1;
+    auto& node = path_[gpu_id][i].nodes_.front();
+    cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
+                    node.val_storage + sizeof(int) * shard_len,
+                    sizeof(int) * shard_len, cudaMemcpyDefault,
+                    node.out_stream);
+  }
+  for (int i = 0; i < gpu_num; ++i) {
+    if (h_left[i] == -1 || h_right[i] == -1) {
+      total_sample_size[i] = 0;
+      continue;
+    }
+    auto& node = path_[gpu_id][i].nodes_.front();
+    cudaStreamSynchronize(node.out_stream);
+
+    auto shard_len = h_right[i] - h_left[i] + 1;
+    thrust::device_vector<int> t_actual_sample_size(shard_len);
+    thrust::copy(actual_sample_size + h_left[i],
+                 actual_sample_size + h_left[i] + shard_len,
+                 t_actual_sample_size.begin());
+    total_sample_size[i] = thrust::reduce(t_actual_sample_size.begin(),
+                                          t_actual_sample_size.end());
+  }
+}
+
 void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
-    int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right,
-    int64_t* src_sample_res, int* actual_sample_size) {
+    int gpu_id, int gpu_num, int* h_left, int* h_right, int64_t* src_sample_res,
+    thrust::host_vector<int>& total_sample_size) {
+  /*
+  if total_sample_size is [4, 5, 1, 6],
+  then cumsum_total_sample_size is [0, 4, 9, 10];
+  */
+  thrust::host_vector<int> cumsum_total_sample_size(gpu_num, 0);
+  thrust::exclusive_scan(total_sample_size.begin(), total_sample_size.end(),
+                         cumsum_total_sample_size.begin(), 0);
   for (int i = 0; i < gpu_num; i++) {
     if (h_left[i] == -1 || h_right[i] == -1) {
       continue;
@@ -80,14 +221,10 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
     // auto& node = path_[gpu_id][i].nodes_[cur_step];
     auto& node = path_[gpu_id][i].nodes_.front();
     cudaMemcpyAsync(
-        reinterpret_cast<char*>(src_sample_res + h_left[i] * sample_size),
+        reinterpret_cast<char*>(src_sample_res + cumsum_total_sample_size[i]),
         node.val_storage + sizeof(int64_t) * shard_len,
-        node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault,
+        sizeof(int64_t) * total_sample_size[i], cudaMemcpyDefault,
         node.out_stream);
-    cudaMemcpyAsync(reinterpret_cast<char*>(actual_sample_size + h_left[i]),
-                    node.val_storage + sizeof(int) * shard_len,
-                    sizeof(int) * shard_len, cudaMemcpyDefault,
-                    node.out_stream);
   }
   for (int i = 0; i < gpu_num; ++i) {
     if (h_left[i] == -1 || h_right[i] == -1) {
@@ -102,17 +239,35 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu(
 TODO:
 how to optimize it to eliminate the for loop
 */
-__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals,
-                             int* d_shard_actual_sample_size,
-                             int* d_actual_sample_size, int* idx,
-                             int sample_size, int len) {
+__global__ void fill_dvalues_actual_sample_size(int* d_shard_actual_sample_size,
+                                                int* d_actual_sample_size,
+                                                int* idx, int len) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i];
-    // d_vals[idx[i]] = d_shard_vals[i];
-    for (int j = 0; j < sample_size; j++) {
-      d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j];
+  }
+}
+
+template <int BLOCK_WARPS, int TILE_SIZE>
+__global__ void fill_dvalues_sample_result(int64_t* d_shard_vals,
+                                           int64_t* d_vals,
+                                           int* d_actual_sample_size, int* idx,
+                                           int* offset, int* d_offset,
+                                           int len) {
+  assert(blockDim.x == WARP_SIZE);
+  assert(blockDim.y == BLOCK_WARPS);
+
+  int i = blockIdx.x * TILE_SIZE + threadIdx.y;
+  const int last_idx = min(static_cast<int>(blockIdx.x + 1) * TILE_SIZE, len);
+  while (i < last_idx) {
+    const int sample_size = d_actual_sample_size[idx[i]];
+    for (int j = threadIdx.x; j < sample_size; j += WARP_SIZE) {
+      d_vals[offset[idx[i]] + j] = d_shard_vals[d_offset[i] + j];
     }
+#ifdef PADDLE_WITH_CUDA
+    __syncwarp();
+#endif
+    i += BLOCK_WARPS;
   }
 }
 
@@ -226,14 +381,12 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
     h_left = [0,5],h_right = [4,8]
 
   */
+
   NeighborSampleResult* result = new NeighborSampleResult(sample_size, len);
   if (len == 0) {
     return result;
   }
-  cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t));
-  cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int));
-  int* actual_sample_size = result->actual_sample_size;
-  int64_t* val = result->val;
+
   int total_gpu = resource_->total_gpu();
   int dev_id = resource_->dev_id(gpu_id);
   platform::CUDAPlace place = platform::CUDAPlace(dev_id);
@@ -258,11 +411,6 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
 
   auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t));
   int64_t* d_shard_keys_ptr = reinterpret_cast<int64_t*>(d_shard_keys->ptr());
-  auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t));
-  int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
-  auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
-  int* d_shard_actual_sample_size_ptr =
-      reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
 
   split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id);
 
@@ -302,6 +450,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
     of alloc_mem_i, actual_sample_size_of_x equals ((int
    *)alloc_mem_i)[shard_len + x]
     */
+
     create_storage(gpu_id, i, shard_len * sizeof(int64_t),
                    shard_len * (1 + sample_size) * sizeof(int64_t));
   }
@@ -322,6 +471,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
                     h_right[i] - h_left[i] + 1,
                     resource_->remote_stream(i, gpu_id));
   }
+
   for (int i = 0; i < total_gpu; ++i) {
     if (h_left[i] == -1) {
       continue;
@@ -335,10 +485,42 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
     int* res_array = reinterpret_cast<int*>(node.val_storage);
     int* actual_size_array = res_array + shard_len;
     int64_t* sample_array = (int64_t*)(res_array + shard_len * 2);
-    neighbor_sample_example<<<grid_size, block_size_, 0,
-                              resource_->remote_stream(i, gpu_id)>>>(
-        graph, res_array, actual_size_array, sample_array, sample_size,
-        shard_len);
+
+    // 1. get actual_size_array.
+    // 2. get sum of actual_size.
+    // 3. get offset ptr
+    thrust::device_vector<int> t_res_array(shard_len);
+    thrust::copy(res_array, res_array + shard_len, t_res_array.begin());
+    thrust::device_vector<int> t_actual_size_array(shard_len);
+    thrust::transform(t_res_array.begin(), t_res_array.end(),
+                      t_actual_size_array.begin(), DegreeFunctor(graph));
+
+    if (sample_size >= 0) {
+      thrust::transform(t_actual_size_array.begin(), t_actual_size_array.end(),
+                        t_actual_size_array.begin(), MaxFunctor(sample_size));
+    }
+
+    thrust::copy(t_actual_size_array.begin(), t_actual_size_array.end(),
+                 actual_size_array);
+
+    int total_sample_sum =
+        thrust::reduce(t_actual_size_array.begin(), t_actual_size_array.end());
+
+    thrust::device_vector<int> output_idx(total_sample_sum);
+    thrust::device_vector<int> output_offset(shard_len);
+    thrust::exclusive_scan(t_actual_size_array.begin(),
+                           t_actual_size_array.end(), output_offset.begin(), 0);
+
+    constexpr int BLOCK_WARPS = 128 / WARP_SIZE;
+    constexpr int TILE_SIZE = BLOCK_WARPS * 16;
+    const dim3 block_(WARP_SIZE, BLOCK_WARPS);
+    const dim3 grid_((shard_len + TILE_SIZE - 1) / TILE_SIZE);
+    neighbor_sample<
+        BLOCK_WARPS,
+        TILE_SIZE><<<grid_, block_, 0, resource_->remote_stream(i, gpu_id)>>>(
+        0, graph, sample_size, res_array, shard_len, sample_array,
+        thrust::raw_pointer_cast(output_idx.data()),
+        thrust::raw_pointer_cast(output_offset.data()));
   }
 
   for (int i = 0; i < total_gpu; ++i) {
@@ -349,13 +531,56 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
     tables_[i]->rwlock_->UNLock();
   }
   // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr);
-  move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size,
-                                            h_left, h_right, d_shard_vals_ptr,
-                                            d_shard_actual_sample_size_ptr);
 
-  fill_dvalues<<<grid_size, block_size_, 0, stream>>>(
-      d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size,
-      d_idx_ptr, sample_size, len);
+  auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int));
+  int* d_shard_actual_sample_size_ptr =
+      reinterpret_cast<int*>(d_shard_actual_sample_size->ptr());
+  // Store total sample number of each gpu.
+  thrust::host_vector<int> d_shard_total_sample_size(total_gpu, 0);
+  move_neighbor_sample_size_to_source_gpu(
+      gpu_id, total_gpu, h_left, h_right, d_shard_actual_sample_size_ptr,
+      thrust::raw_pointer_cast(d_shard_total_sample_size.data()));
+  int allocate_sample_num = 0;
+  for (int i = 0; i < total_gpu; ++i) {
+    allocate_sample_num += d_shard_total_sample_size[i];
+  }
+  auto d_shard_vals =
+      memory::Alloc(place, allocate_sample_num * sizeof(int64_t));
+  int64_t* d_shard_vals_ptr = reinterpret_cast<int64_t*>(d_shard_vals->ptr());
+  move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, h_left, h_right,
+                                            d_shard_vals_ptr,
+                                            d_shard_total_sample_size);
+
+  cudaMalloc((void**)&result->val, allocate_sample_num * sizeof(int64_t));
+  cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int));
+  cudaMalloc((void**)&result->offset, len * sizeof(int));
+  int64_t* val = result->val;
+  int* actual_sample_size = result->actual_sample_size;
+  int* offset = result->offset;
+
+  fill_dvalues_actual_sample_size<<<grid_size, block_size_, 0, stream>>>(
+      d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, len);
+  thrust::device_vector<int> t_actual_sample_size(len);
+  thrust::copy(actual_sample_size, actual_sample_size + len,
+               t_actual_sample_size.begin());
+  thrust::exclusive_scan(t_actual_sample_size.begin(),
+                         t_actual_sample_size.end(), offset, 0);
+  int* d_offset;
+  cudaMalloc(&d_offset, len * sizeof(int));
+  thrust::copy(d_shard_actual_sample_size_ptr,
+               d_shard_actual_sample_size_ptr + len,
+               t_actual_sample_size.begin());
+  thrust::exclusive_scan(t_actual_sample_size.begin(),
+                         t_actual_sample_size.end(), d_offset, 0);
+  constexpr int BLOCK_WARPS_ = 128 / WARP_SIZE;
+  constexpr int TILE_SIZE_ = BLOCK_WARPS_ * 16;
+  const dim3 block__(WARP_SIZE, BLOCK_WARPS_);
+  const dim3 grid__((len + TILE_SIZE_ - 1) / TILE_SIZE_);
+  fill_dvalues_sample_result<BLOCK_WARPS_,
+                             TILE_SIZE_><<<grid__, block__, 0, stream>>>(
+      d_shard_vals_ptr, val, actual_sample_size, d_idx_ptr, offset, d_offset,
+      len);
+
   cudaStreamSynchronize(stream);
   for (int i = 0; i < total_gpu; ++i) {
     int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1;
@@ -364,6 +589,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id,
     }
     destroy_storage(gpu_id, i);
   }
+  cudaFree(d_offset);
   return result;
 }
 
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
index 2cf702969f99a..f85ed330dc8ea 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #ifdef PADDLE_WITH_HETERPS
+//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
 #include <queue>
 
 namespace paddle {
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu
new file mode 100644
index 0000000000000..8c7ea10b26565
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu
@@ -0,0 +1,108 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
+#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
+#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+
+using namespace paddle::framework;
+void prepare_file(char file_name[], std::vector<std::string> data) {
+  std::ofstream ofile;
+  ofile.open(file_name);
+  for (auto x : data) {
+    ofile << x << std::endl;
+  }
+
+  ofile.close();
+}
+char edge_file_name[] = "edges.txt";
+TEST(TEST_FLEET, graph_sample) {
+  std::vector<std::string> edges;
+  int gpu_count = 3;
+  std::vector<int> dev_ids;
+  dev_ids.push_back(0);
+  dev_ids.push_back(1);
+  dev_ids.push_back(2);
+
+  std::shared_ptr<HeterPsResource> resource =
+      std::make_shared<HeterPsResource>(dev_ids);
+  resource->enable_p2p();
+  GpuPsGraphTable g(resource);
+  int node_count = 10;
+  std::vector<std::vector<int64_t>> neighbors(node_count);
+  int ind = 0;
+  int64_t node_id = 0;
+  // std::vector<GpuPsCommGraph> graph_list(gpu_count);
+  while (ind < node_count) {
+    int neighbor_size = ind + 1;
+    while (neighbor_size--) {
+      edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) +
+                      "\t1.0");
+      node_id++;
+    }
+    ind++;
+  }
+  /*
+  gpu 0:
+  0,3,6,9
+  gpu 1:
+  1,4,7
+  gpu 2:
+  2,5,8
+
+  query(2,6) returns nodes [6,9,1,4,7,2]
+  */
+  ::paddle::distributed::GraphParameter table_proto;
+  table_proto.set_gpups_mode(true);
+  table_proto.set_gpups_mode_shard_num(127);
+  table_proto.set_gpu_num(3);
+  table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler");
+  table_proto.set_gpups_graph_sample_args("5,5,1,1");
+  prepare_file(edge_file_name, edges);
+  g.init_cpu_table(table_proto);
+  g.load(std::string(edge_file_name), std::string("e>"));
+  /*
+   node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x]
+   so node 6's neighbors are [21,22...,27]
+   node 7's neighbors are [28,29,..35]
+    node 0's neighbors are [0]
+   query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23]
+   6 --index-->2
+   0 --index--->0
+   7 --index-->2
+  */
+  int64_t cpu_key[3] = {7, 0, 6};
+  void *key;
+  cudaMalloc((void **)&key, 3 * sizeof(int64_t));
+  cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
+  auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
+  int64_t *res = new int64_t[9];
+  cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost);
+  std::sort(res, res + 3);
+  std::sort(res + 6, res + 9);
+  int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
+  for (int i = 0; i < 9; i++) {
+    if (expected_sample_val[i] != -1) {
+      ASSERT_EQ(res[i], expected_sample_val[i]);
+    }
+  }
+  delete[] res;
+  delete neighbor_sample_res;
+}
diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
index 697e0ba2cdf34..06c7026eb51ca 100644
--- a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu
@@ -94,19 +94,44 @@ TEST(TEST_FLEET, graph_comm) {
    0 --index--->0
    7 --index-->2
   */
+
   int64_t cpu_key[3] = {7, 0, 6};
   void *key;
   cudaMalloc((void **)&key, 3 * sizeof(int64_t));
   cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice);
   auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3);
-  res = new int64_t[9];
-  cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost);
-  int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23};
-  for (int i = 0; i < 9; i++) {
-    if (expected_sample_val[i] != -1) {
-      ASSERT_EQ(res[i], expected_sample_val[i]);
+  res = new int64_t[7];
+  cudaMemcpy(res, neighbor_sample_res->val, 56, cudaMemcpyDeviceToHost);
+  int *actual_sample_size = new int[3];
+  cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, 12,
+             cudaMemcpyDeviceToHost);  // 3, 1, 3
+  int *cumsum_sample_size = new int[3];
+  cudaMemcpy(cumsum_sample_size, neighbor_sample_res->offset, 12,
+             cudaMemcpyDeviceToHost);  // 0, 3, 4
+
+  std::vector<std::vector<int64_t>> neighbors_;
+  std::vector<int64_t> neighbors_7 = {28, 29, 30, 31, 32, 33, 34, 35};
+  std::vector<int64_t> neighbors_0 = {0};
+  std::vector<int64_t> neighbors_6 = {21, 22, 23, 24, 25, 26, 27};
+  neighbors_.push_back(neighbors_7);
+  neighbors_.push_back(neighbors_0);
+  neighbors_.push_back(neighbors_6);
+  for (int i = 0; i < 3; i++) {
+    for (int j = cumsum_sample_size[i];
+         j < cumsum_sample_size[i] + actual_sample_size[i]; j++) {
+      bool flag = false;
+      for (int k = 0; k < neighbors_[i].size(); k++) {
+        if (res[j] == neighbors_[i][k]) {
+          flag = true;
+          break;
+        }
+      }
+      ASSERT_EQ(flag, true);
     }
   }
+
   delete[] res;
+  delete[] actual_sample_size;
+  delete[] cumsum_sample_size;
   delete neighbor_sample_res;
 }
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
index 31a30f72e3aa6..432e57107e84d 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
       t.join();
     }
     timeline.Pause();
-    VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
+    VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
   } else {
     CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
     VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
@@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
       t.join();
     }
     timeline.Pause();
-    VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
+    VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
   }
 
   timeline.Start();
@@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
     int32_t cnt = 0;
     while (true) {
       auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
-          reinterpret_cast<char**>(local_ptr[i].data()), this->table_id_,
+          i, reinterpret_cast<char**>(local_ptr[i].data()), this->table_id_,
           local_keys[i].data(), key_size);
       bool flag = true;
 
@@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
     int32_t cnt = 0;
     while (true) {
       auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
-          reinterpret_cast<char**>(local_dim_ptr[i][j].data()), this->table_id_,
-          local_dim_keys[i][j].data(), key_size);
+          i, reinterpret_cast<char**>(local_dim_ptr[i][j].data()),
+          this->table_id_, local_dim_keys[i][j].data(), key_size);
       bool flag = true;
 
       tt.wait();
@@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
     t.join();
   }
   timeline.Pause();
-  VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec()
+  VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec()
           << " seconds.";
   if (multi_node_) {
     auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance();
@@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
     t.join();
   }
   timeline.Pause();
-  VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
+  VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec()
           << " seconds.";
 }
 
@@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() {
         "[BeginPass] after build_task, current task is not null."));
   }
 
-  VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
+  VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
 }
 
 void PSGPUWrapper::EndPass() {
@@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() {
   current_task_ = nullptr;
   gpu_free_channel_->Put(current_task_);
   timer.Pause();
-  VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
+  VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
 }
 
 void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index 5119c30690691..2babecc6ddf93 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
     return var_types[0] == proto::VarType::SELECTED_ROWS;
   }
 
+  bool IsDenseTensorVectorInput(const std::string& name) const override {
+    auto var_types = ctx_.GetInputsVarType(name);
+    return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY;
+  }
+
   bool IsDenseTensorOutput(const std::string& name) const override {
     auto var_types = ctx_.GetOutputsVarType(name);
     return var_types[0] == proto::VarType::LOD_TENSOR;
@@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor {
         return var->Get<phi::DenseTensor>().dims();
       } else if (var->IsType<phi::SelectedRows>()) {
         return var->Get<phi::SelectedRows>().dims();
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // use tensor array size as dims
+        auto& tensor_array = var->Get<framework::LoDTensorArray>();
+        return phi::make_ddim({static_cast<int64_t>(tensor_array.size())});
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
-            "Currently, only can get dims from DenseTensor or SelectedRows."));
+            "Currently, only can get dims from DenseTensor or SelectedRows or "
+            "DenseTensorArray."));
       }
     } else {
       auto* var = BOOST_GET_CONST(VarDesc*, var_);
@@ -144,6 +154,10 @@ class CompatMetaTensor : public phi::MetaTensor {
         return var->Get<phi::DenseTensor>().dtype();
       } else if (var->IsType<phi::SelectedRows>()) {
         return var->Get<phi::SelectedRows>().dtype();
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // NOTE(chenweihang): do nothing
+        // Unsupported get dtype from LoDTensorArray now
+        return phi::DataType::UNDEFINED;
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Currently, only can get dtype from DenseTensor or SelectedRows."));
@@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor {
   DataLayout layout() const override {
     if (is_runtime_) {
       auto* var = BOOST_GET_CONST(Variable*, var_);
-      return var->Get<LoDTensor>().layout();
+      if (var->IsType<phi::DenseTensor>()) {
+        return var->Get<phi::DenseTensor>().layout();
+      } else if (var->IsType<phi::SelectedRows>()) {
+        return var->Get<phi::SelectedRows>().layout();
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // NOTE(chenweihang): do nothing
+        // Unsupported get layout from LoDTensorArray now
+        return phi::DataLayout::UNDEFINED;
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can get layout from DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       // NOTE(chenweihang): do nothing
       // Unsupported get layout for VarDesc now
@@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor {
       } else if (var->IsType<phi::SelectedRows>()) {
         auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
         phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        auto* tensor_array = var->GetMutable<framework::LoDTensorArray>();
+        // Note: Here I want enforce `tensor_array->size() == 0UL`, because
+        // inplace using on LoDTensorArray is dangerous, but the unittest
+        // `test_list` contains this behavior
+        PADDLE_ENFORCE_EQ(dims.size(), 1UL,
+                          platform::errors::InvalidArgument(
+                              "LoDTensorArray can only have one dimension."));
+        // only set the array size for LoDTensorArray input
+        tensor_array->resize(dims[0]);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Currently, only can set dims from DenseTensor or SelectedRows."));
@@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor {
       } else if (var->IsType<phi::SelectedRows>()) {
         auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
         phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // NOTE(chenweihang): do nothing
+        // Unsupported set dtype for LoDTensorArray now
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Currently, only can set dtype from DenseTensor or SelectedRows."));
@@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor {
   void set_layout(DataLayout layout) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      phi::DenseTensorUtils::GetMutableMeta(
-          static_cast<phi::DenseTensor*>(tensor))
-          ->layout = layout;
+      if (var->IsType<phi::DenseTensor>()) {
+        auto* tensor = var->GetMutable<phi::DenseTensor>();
+        phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
+      } else if (var->IsType<phi::SelectedRows>()) {
+        auto* tensor = var->GetMutable<phi::SelectedRows>()->mutable_value();
+        phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout;
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // NOTE(chenweihang): do nothing
+        // Unsupported set dtype for LoDTensorArray now
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, only can set layout from DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       // NOTE(chenweihang): do nothing
       // Unsupported set layout for VarDesc now
@@ -251,9 +300,7 @@ class CompatMetaTensor : public phi::MetaTensor {
   void share_meta(const MetaTensor& meta_tensor) override {
     share_dims(meta_tensor);
     set_dtype(meta_tensor.dtype());
-    // VarDesc doesn't contains layout, so we cannot share layout
-    // set_layout(meta_tensor.layout());
-
+    set_layout(meta_tensor.layout());
     // special case: share lod of LoDTensor
     share_lod(meta_tensor);
   }
@@ -442,6 +489,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
               attr_name, infershape_input.size()));
         }
       }
+    } else if (attr_defs[i].type_index ==
+               std::type_index(typeid(std::vector<phi::Scalar>))) {
+      auto& attr = attr_reader.GetAttr(attr_name);
+      if (std::type_index(attr.type()) ==
+          std::type_index(typeid(std::vector<int32_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<int64_t>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<float>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<float>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::vector<double>))) {
+        const auto& vec = BOOST_GET_CONST(std::vector<double>, attr);
+        std::vector<phi::Scalar> scalar_list;
+        scalar_list.reserve(vec.size());
+        for (const auto& val : vec) {
+          scalar_list.emplace_back(val);
+        }
+        infer_meta_context.EmplaceBackAttr(std::move(scalar_list));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported cast op attribute `%s` to vector<Scalar> when "
+            "construct InferMetaContext.",
+            attr_names[i]));
+      }
     } else if (ctx->HasAttr(attr_name)) {
       // Emplace Back Attr according to the type of attr.
       auto& attr = attr_reader.GetAttr(attr_name);
@@ -500,8 +592,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx,
             "Unsupported attribute type is received when call "
             "InferShapeFunctor."));
       }
-    } else {
-      // do nothing
+    } else if (ctx->HasInput(attr_name)) {
+      // convert from data
+      if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) {
+        if (ctx->IsRuntime()) {
+          const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name);
+          auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]);
+          auto val = experimental::MakePhiScalarFromVar(*var_temp);
+          int32_t val_int = val.template to<int32_t>();
+          infer_meta_context.EmplaceBackAttr(val_int);
+        } else {
+          infer_meta_context.EmplaceBackAttr(-1);
+        }
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Get value from variable only support int yet"));
+      }
     }
   }
 
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 623c8a048c241..7aaaef712a6e9 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference)
 pass_library(add_support_int8_pass inference)
 pass_library(matmul_scale_fuse_pass inference)
 pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
+pass_library(mixed_precision_configure_pass inference)
 pass_library(generate_pass DEPS pass_desc_proto)
 target_link_libraries(generate_pass pass_desc_proto)
 
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 036fde8fac6d9..f5f6f3ecb855c 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -95,6 +95,7 @@ std::map<std::string, std::vector<ir::Node *>> Graph::InitFromBlock(
   std::unordered_map<std::string, std::pair<VarDesc *, int>>
       name_to_desc_block_id;
 
+  block_id_ = block.ID();
   const BlockDesc *block_var_visible = &block;
   while (block_var_visible != nullptr) {
     for (auto *var : block_var_visible->AllVars()) {
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 21e743e3587d8..10645f08dc3ba 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -230,6 +230,7 @@ class Graph {
     auto *x =
         AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
 
@@ -245,6 +246,7 @@ class Graph {
                      "The OpDesc used to create operator node is null."));
     auto *x = AddNode(new ir::Node(op_desc));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
 
@@ -263,6 +265,7 @@ class Graph {
         num_node_created_);
     auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
 
@@ -276,6 +279,7 @@ class Graph {
     }
     auto *x = AddNode(new ir::Node(name, type, block_id_));
     x->SetId(num_node_created_++);
+    x->SetGraphId(block_id_);
     return x;
   }
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 18068e22b7f3c..164a13d1560f4 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() {
   return output_var;
 }
 
-PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
-  auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
-                                ->assert_is_op("elementwise_add");
-
-  x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
-  y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
-  auto out_var = pattern->NewNode(elementwise_add_out_repr())
+PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var,
+                                          const std::string elementwise_type) {
+  auto elementwise_op =
+      pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type);
+
+  x_var->AsInput()->assert_is_op_input(elementwise_type, "X");
+  y_var->AsInput()->assert_is_op_input(elementwise_type, "Y");
+  auto out_var = pattern->NewNode(elementwise_out_repr())
                      ->AsOutput()
-                     ->assert_is_op_output("elementwise_add", "Out");
+                     ->assert_is_op_output(elementwise_type, "Out");
 
-  elementwise_add_op->LinksFrom({x_var, y_var});
-  elementwise_add_op->LinksTo({out_var});
+  elementwise_op->LinksFrom({x_var, y_var});
+  elementwise_op->LinksTo({out_var});
 
   return out_var;
 }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 062d2f9dedce6..17c70ace301d3 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1016,20 +1016,20 @@ struct Pool : public PatternBase {
   PATTERN_DECL_NODE(pool_output);
 };
 
-// ElementwiseAdd used in residual connections.
-// y_var is used and convolution output.
-// The operator is removed, when residual
-// connection fusion is on.
-struct ElementwiseAdd : public PatternBase {
-  ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "elementwise_add") {}
-
-  PDNode* operator()(PDNode* x_var, PDNode* y_var);
-
-  PATTERN_DECL_NODE(elementwise_add_op);
-  PATTERN_DECL_NODE(elementwise_add_x);
-  PATTERN_DECL_NODE(elementwise_add_y);
-  PATTERN_DECL_NODE(elementwise_add_out);
+// Elementwise ops
+// Forward pass for element-wise operators (add, mul)
+// elementwise_mul_out is the result of the operator
+struct Elementwise : public PatternBase {
+  Elementwise(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elementwise") {}
+
+  PDNode* operator()(PDNode* x_var, PDNode* y_var,
+                     const std::string elementwise_type);
+
+  PATTERN_DECL_NODE(elementwise_op);
+  PATTERN_DECL_NODE(elementwise_x);
+  PATTERN_DECL_NODE(elementwise_y);
+  PATTERN_DECL_NODE(elementwise_out);
 };
 
 // Transpose op
diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
index 9fe50deaf2d72..7cdb7a8854aad 100644
--- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
@@ -25,14 +25,14 @@ std::set<std::string> ignored_ops = {
     "sum",
     "clip",
     "clip_by_norm",
-    "square",
     "reduce_sum",
     "sqrt",
     "elementwise_max",
     "elementwise_div",
     "elementwise_mul",
-    "scale",   // adamax
-    "assign",  // adamw
+    "scale",           // adamax
+    "assign",          // adamw
+    "squared_l2_norm"  // gradient_clip_norm
 };
 
 const bool startswith(const std::string& str, const std::string& pre) {
@@ -62,6 +62,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
   new_op.SetAttr("with_lr_sched", false);
 
   std::set<std::string> set_ops{};
+  // save the weight decay tensor_name and weight_decay_value for Lamb
+  std::vector<std::string> weight_decay_vars{};
+  std::vector<float> weight_decay_values{};
+
   // use map store <op_type, op_ptr> ?
   for (auto* node : graph->Nodes()) {
     if (!node->IsOp()) {
@@ -75,6 +79,15 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
     auto op_role = static_cast<OpRole>(op_role_);
 
     if (op_role == OpRole::kOptimize) {
+      // save weight decay value from every lamb optimizer op
+      if (op_type == "lamb" && op->HasAttr("weight_decay")) {
+        auto weight_decay_value =
+            BOOST_GET_CONST(float, op->GetAttr("weight_decay"));
+        auto params = op->Output("ParamOut");
+        weight_decay_vars.push_back(params[0]);
+        weight_decay_values.push_back(weight_decay_value);
+      }
+
       if (set_ops.count(op_type)) {
         continue;
       }
@@ -270,7 +283,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
   // seems with_lr_sched is always true
   new_op.SetAttr("with_lr_sched", true);
 
-  // setup weight deacy
+  // setup weight decay for Lamb
+  new_op.SetAttr("weight_decay_vars", weight_decay_vars);
+  new_op.SetAttr("weight_decay_values", weight_decay_values);
+
   // weight_decay/coeff is "scale" attr of scale_op
   if (set_ops.count("scale") && set_ops.count("sum")) {
     if (set_ops.count("sign")) {
diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc
index e754ba72ad857..5cd8358dc083e 100644
--- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc
@@ -30,7 +30,8 @@ void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const {
 
   auto ipu_backend = platform::ipu::IpuBackend::GetInstance();
   auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16;
-  if (enable_fp16) {
+  auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op;
+  if (enable_fp16 && transfer_cast_op) {
     for (auto* node : graph->Nodes()) {
       if (node->IsOp() && node->Op()->Type() == "popart_cast") {
         if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) ==
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
index 1b2a62695fb13..9fc6de3c8c172 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
@@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch(
       varinfo_maps.at(cinn_launch_op->GetScopeIdx());
 
   // collect all MemOptVarInfos of external variables
-  // that would be eager deleted after the cinn_launch subgraph executed,
-  // and store them as attribute of the subgraph
+  // that were eager deleted after the cinn_launch subgraph executed,
+  // and we will delete them in advance among eager_deletion_ops
+  // inside cinn_launch subgraph, so store them as attribute of the subgraph
+  // to pass to the inner eager_deletion_ops.
   for (const auto& var_name : vars_to_delete) {
     auto it = src_varinfo_map.find(var_name);
     PADDLE_ENFORCE_NE(it, src_varinfo_map.end(),
@@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch(
                           "MemOptVarInfo of var[%s] not found", var_name));
     dst_varinfo_map.emplace(var_name, it->second);
   }
+  // skip running of the followed eager_deletion_op
+  followed_eager_deletion_op->SetSkipRunning(true);
 }
 
 static void TakeVarInfoFromMainGraph(
diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc
new file mode 100644
index 0000000000000..4aa59d9196b1b
--- /dev/null
+++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void MixedPrecisionConfigurePass::InsertCastOps(
+    Graph* graph, const StringSet& blacklist) const {
+  VLOG(3) << "Insert the cast op before and after the kernel that does not "
+             "supports fp16 precision";
+
+  auto update_cast_desc = [&](
+      framework::OpDesc& desc, const std::string& x_name,
+      const std::string& out_name, const int in_dtype, const int out_dtype) {
+    desc.SetType("cast");
+    desc.SetInput("X", {x_name});
+    desc.SetOutput("Out", {out_name});
+    desc.SetAttr("in_dtype", in_dtype);
+    desc.SetAttr("out_dtype", out_dtype);
+    desc.SetAttr("use_mkldnn", false);
+    desc.SetAttr("with_quant_attr", false);
+    desc.Flush();
+  };
+
+  auto cast_input = [&](Graph* graph, Node* op_node,
+                        const StringSet& cast_list) {
+    auto inlinks = op_node->inputs;
+    for (auto* pre_node : inlinks) {
+      if (pre_node->IsVar()) {
+        const auto is_persistable = pre_node->Var()->Persistable();
+        const auto is_float =
+            pre_node->Var()->GetDataType() == proto::VarType::FP16 ||
+            pre_node->Var()->GetDataType() == proto::VarType::FP32 ||
+            pre_node->Var()->GetDataType() == proto::VarType::FP64;
+        if (!is_persistable && is_float) {
+          int suffix = 0;
+          for (auto* pre_node_input : pre_node->inputs) {
+            if (!pre_node_input->IsOp()) continue;
+            const auto& type = pre_node_input->Op()->Type();
+            if (!cast_list.count(type) && type != "cast") {
+              std::string old_name = pre_node->Name();
+              std::string new_name =
+                  old_name + "_cast.tmp_" + std::to_string(suffix);
+              suffix++;
+
+              framework::OpDesc new_op_desc(op_node->Op()->Block());
+              // 4 for fp16, 5 for fp32
+              update_cast_desc(new_op_desc, old_name, new_name, 4, 5);
+              auto* new_op = graph->CreateOpNode(&new_op_desc);
+
+              VarDesc out_var(new_name);
+              out_var.SetPersistable(false);
+              auto* node_var = graph->CreateVarNode(&out_var);
+
+              op_node->Op()->RenameInput(old_name, new_name);
+              IR_NODE_LINK_TO(pre_node, new_op);
+              IR_NODE_LINK_TO(new_op, node_var);
+              IR_NODE_LINK_TO(node_var, op_node);
+            }
+          }
+        }
+      }
+    }
+  };
+
+  auto cast_output = [&](Graph* graph, Node* op_node,
+                         const StringSet& cast_list) {
+    auto outlinks = op_node->outputs;
+    for (auto* next_node : outlinks) {
+      if (next_node->IsVar()) {
+        const auto is_persistable = next_node->Var()->Persistable();
+        const auto is_float =
+            next_node->Var()->GetDataType() == proto::VarType::FP16 ||
+            next_node->Var()->GetDataType() == proto::VarType::FP32 ||
+            next_node->Var()->GetDataType() == proto::VarType::FP64;
+        if (!is_persistable && is_float) {
+          int suffix = 0;
+          for (auto* next_node_output : next_node->outputs) {
+            if (!next_node_output->IsOp()) continue;
+
+            const auto& type = next_node_output->Op()->Type();
+            if (!cast_list.count(type) && type != "cast") {
+              std::string old_name = next_node->Name();
+              std::string new_name =
+                  old_name + "_cast.tmp_" + std::to_string(suffix);
+              suffix++;
+
+              framework::OpDesc new_op_desc(op_node->Op()->Block());
+              // 4 for fp16, 5 for fp32
+              update_cast_desc(new_op_desc, old_name, new_name, 5, 4);
+              auto* new_op = graph->CreateOpNode(&new_op_desc);
+
+              VarDesc out_var(new_name);
+              out_var.SetPersistable(false);
+              auto* node_var = graph->CreateVarNode(&out_var);
+
+              next_node_output->Op()->RenameInput(old_name, new_name);
+              IR_NODE_LINK_TO(next_node, new_op);
+              IR_NODE_LINK_TO(new_op, node_var);
+              IR_NODE_LINK_TO(node_var, next_node_output);
+            }
+          }
+        }
+      }
+    }
+  };
+
+  for (auto* op_node :
+       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
+    if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
+        op_node->Op()->Type() == "fetch")
+      continue;
+
+    const auto& type = op_node->Op()->Type();
+    if (blacklist.count(type)) {
+      cast_input(graph, op_node, blacklist);
+      cast_output(graph, op_node, blacklist);
+    }
+  }
+}
+
+void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const {
+  const auto blacklist =
+      Get<std::unordered_set<std::string>>("gpu_fp16_disabled_op_types");
+  InsertCastOps(graph, blacklist);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(mixed_precision_configure_pass,
+              paddle::framework::ir::MixedPrecisionConfigurePass);
diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h
new file mode 100644
index 0000000000000..fc5a612ecb833
--- /dev/null
+++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using StringSet = std::unordered_set<std::string>;
+
+class MixedPrecisionConfigurePass : public FusePassBase {
+ public:
+  MixedPrecisionConfigurePass() = default;
+  virtual ~MixedPrecisionConfigurePass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;
+
+ private:
+  void InsertCastOps(Graph* graph, const StringSet& blacklist) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index 2403e60df3918..fc2758c273450 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -118,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() {
       .IsType<std::vector<int>>()
       .End()
       .AddAttr("data_format")
-      .IsStringIn({"NCHW", "AnyLayout"})
+      .IsStringIn({"NHWC", "NCHW", "AnyLayout"})
       .End();
 
   AddOpCompat(OpCompat("elementwise_add"))
@@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
   patterns::Conv conv_pattern{pattern, name_scope};
   auto conv_output = conv_pattern();
 
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(
-      conv_output,
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
+  patterns::Elementwise elementwise_pattern{pattern, name_scope};
+  elementwise_pattern(
+      conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
+      "elementwise_add");
   conv_output->AsIntermediate();
 
   int found_conv_as_x_count = 0;
@@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
     GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
 
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_pattern);
 
-    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return;
 
-    if (!IsReachable(g, elementwise_add_identity, conv_output)) return;
+    if (!IsReachable(g, elementwise_identity, conv_output)) return;
 
     if (HasFusedActivation(conv_op)) return;
 
@@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
       return;
     }
 
-    conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
-    conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+    conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()});
+    conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
     conv_op->Op()->SetAttr("fuse_residual_connection", true);
 
-    GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
+    GraphSafeRemoveNodes(g, {conv_output, elementwise_op});
 
-    IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
-    IR_NODE_LINK_TO(conv_op, elementwise_add_out);
+    IR_NODE_LINK_TO(elementwise_identity, conv_op);
+    IR_NODE_LINK_TO(conv_op, elementwise_out);
 
     found_conv_as_x_count++;
   };
@@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
   patterns::Conv conv_pattern{pattern, name_scope};
   auto conv_output = conv_pattern();
 
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
-      conv_output);
+  patterns::Elementwise elementwise_pattern{pattern, name_scope};
+  elementwise_pattern(
+      pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output,
+      "elementwise_add");
   conv_output->AsIntermediate();
 
   int found_conv_as_y_count = 0;
@@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
     GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
 
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_pattern);
 
-    if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return;
 
-    if (!IsReachable(g, elementwise_add_x, conv_output)) return;
+    if (!IsReachable(g, elementwise_x, conv_output)) return;
 
     if (HasFusedActivation(conv_op)) return;
 
@@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
       return;
     }
 
-    conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()});
-    conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+    conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()});
+    conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
     conv_op->Op()->SetAttr("fuse_residual_connection", true);
 
-    GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op});
+    GraphSafeRemoveNodes(g, {conv_output, elementwise_op});
 
-    IR_NODE_LINK_TO(elementwise_add_x, conv_op);
-    IR_NODE_LINK_TO(conv_op, elementwise_add_out);
+    IR_NODE_LINK_TO(elementwise_x, conv_op);
+    IR_NODE_LINK_TO(conv_op, elementwise_out);
 
     found_conv_as_y_count++;
   };
@@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
   patterns::Conv conv_y_pattern{pattern, name_scope};
   auto conv_y_output = conv_y_pattern();
 
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
-  elementwise_add_pattern(conv_x_output, conv_y_output);
+  patterns::Elementwise elementwise_pattern{pattern, name_scope};
+  elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add");
   conv_x_output->AsIntermediate();
   conv_y_output->AsIntermediate();
 
@@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
     GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern);
 
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_pattern);
 
     if (!IsCompat(subgraph, g)) {
       LOG(WARNING)
@@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
       return;
     }
 
-    if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return;
-    if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return;
+    if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return;
+    if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return;
 
     Node* projection_node;
     Node* residual_conv_op;
@@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
     if (HasFusedActivation(residual_conv_op)) return;
 
     residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
-    residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+    residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()});
 
     residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
 
-    GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op});
+    GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op});
 
     IR_NODE_LINK_TO(projection_node, residual_conv_op);
-    IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
+    IR_NODE_LINK_TO(residual_conv_op, elementwise_out);
 
     found_projection_conv_count++;
   };
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index 371482b5343d6..f4358fb243f20 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
   PrettyLogDetail("---    quantized %d matmul ops", quantize_matmul_count);
 }
 
-void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
+void CPUQuantizePass::QuantizeElementwise(
+    Graph* graph, const std::string elementwise_type) const {
   GraphPatternDetector gpd;
   auto pattern = gpd.mutable_pattern();
-  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
+  patterns::Elementwise elementwise_pattern{pattern, name_scope_};
 
-  elementwise_add_pattern(
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
-      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
+  elementwise_pattern(
+      pattern->NewNode(elementwise_pattern.elementwise_x_repr()),
+      pattern->NewNode(elementwise_pattern.elementwise_y_repr()),
+      elementwise_type);
 
-  int quantize_elementwise_add_count = 0;
+  int quantize_elementwise_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-    VLOG(4) << "Quantize elementwise_add op";
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
-                              elementwise_add_pattern);
+    VLOG(4) << "Quantize " + elementwise_type + " op";
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op,
+                              elementwise_pattern);
 
     // skip if should not be quantized
-    if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) {
-      LogQuantizationDisabled(elementwise_add_op);
+    if (!platform::HasOpINT8DataType(elementwise_op->Op())) {
+      LogQuantizationDisabled(elementwise_op);
       return;
     }
 
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
-                              elementwise_add_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
-                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y,
+                              elementwise_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out,
+                              elementwise_pattern);
 
     if (!AreScalesPresentForNodes(
-            {elementwise_add_x, elementwise_add_y, elementwise_add_out})) {
-      LogCannotQuantizeOp(elementwise_add_op,
+            {elementwise_x, elementwise_y, elementwise_out})) {
+      LogCannotQuantizeOp(elementwise_op,
                           "No scale available for the operator");
       return;
     }
 
     bool is_x_unsigned{false}, is_y_unsigned{false};
-    auto input_x_scale =
-        GetScaleValueForNode(elementwise_add_x, &is_x_unsigned);
-    auto input_y_scale =
-        GetScaleValueForNode(elementwise_add_y, &is_y_unsigned);
+    auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned);
+    auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned);
 
     // TODO(sfraczek): add support for different signness
     if (is_x_unsigned != is_y_unsigned) {
-      LogCannotQuantizeOp(elementwise_add_op,
-                          "ElementwiseAdd inputs must be of the same type.");
+      LogCannotQuantizeOp(elementwise_op,
+                          "Elementwise inputs must be of the same type.");
       return;
     }
 
-    QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale,
+    QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale,
                   is_x_unsigned, "Scale_x");
-    QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale,
+    QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale,
                   is_y_unsigned, "Scale_y");
 
     bool is_output_unsigned{false};
     auto output_scale =
-        GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
+        GetScaleValueForNode(elementwise_out, &is_output_unsigned);
 
-    DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out",
-                     output_scale, is_output_unsigned, "Scale_out");
+    DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale,
+                     is_output_unsigned, "Scale_out");
 
-    ++quantize_elementwise_add_count;
+    ++quantize_elementwise_count;
   };
   gpd(graph, handler);
-  AddStatis(quantize_elementwise_add_count);
+  AddStatis(quantize_elementwise_count);
 
-  PrettyLogDetail("---    quantized %d elementwise_add ops",
-                  quantize_elementwise_add_count);
+  PrettyLogDetail("---    quantized %d %s ops", quantize_elementwise_count,
+                  elementwise_type);
 }
 
 void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
@@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeFc(graph);
   QuantizeReshape(graph);
   QuantizeMatmul(graph);
-  QuantizeElementwiseAdd(graph);
+  QuantizeElementwise(graph, "elementwise_add");
+  QuantizeElementwise(graph, "elementwise_mul");
   QuantizeFusionGru(graph);
   QuantizeMultiGru(graph);
   QuantizeFusionLSTM(graph);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index 412c4e40a01d5..3a286264e41ff 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase {
   void QuantizeTranspose(Graph* graph) const;
   void QuantizeReshape(Graph* graph) const;
   void QuantizeMatmul(Graph* graph) const;
-  void QuantizeElementwiseAdd(Graph* graph) const;
+  void QuantizeElementwise(Graph* graph,
+                           const std::string elementwise_type) const;
   void QuantizeFusionGru(Graph* graph) const;
   void QuantizeMultiGru(Graph* graph) const;
   void QuantizeFusionLSTM(Graph* graph) const;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 889417b78c864..22000865948d6 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
-  } else if (type == "elementwise_add") {
+  } else if (type == "elementwise_add" || type == "elementwise_mul") {
     op->SetInput("X", {inputs[0]});
     if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
     op->SetOutput("Out", {outputs[0]});
@@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) {
               scale);
     scale_names.push_back("Scale_in");
     scale_names.push_back("Scale_out");
-  } else if (type == "matmul" || type == "elementwise_add") {
+  } else if (type == "matmul" || type == "elementwise_add" ||
+             type == "elementwise_mul") {
     scale_names.push_back("Scale_x");
     scale_names.push_back("Scale_y");
     scale_names.push_back("Scale_out");
@@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) {
            expected_operators, added_nodes, 1.0f);
 }
 
-static const std::initializer_list<std::string> variable_names_elementwise_add =
-    {"a", "b", "c", "d", "e", "f"};
+static const std::initializer_list<std::string> variable_names_elementwise = {
+    "a", "b", "c", "d", "e", "f"};
 
-ProgramDesc BuildProgramDescElementwiseAdd() {
+ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type,
+                                        const std::string elementwise_name) {
   ProgramDesc prog;
-  for (auto& v : variable_names_elementwise_add) {
+  for (auto& v : variable_names_elementwise) {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
   SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
-  SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
+  SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true,
         "int8");
   SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32");
 
   return prog;
 }
 
-TEST(CpuQuantizePass, elementwise_add) {
+void TestElementwise(const std::string elementwise_type,
+                     const std::string elementwise_name) {
   // 2 Quant + 2 IN + 1 DeQuant + 1 OUT
   int added_nodes = 6;
   std::unordered_map<std::string, int> expected_operators = {
-      {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}};
-  MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
-           expected_operators, added_nodes, SCALE * S8_MAX);
+      {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}};
+  MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
+           variable_names_elementwise, expected_operators, added_nodes,
+           SCALE * S8_MAX);
 }
 
-TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
+void TestElementwiseOutputScaleMissing(const std::string elementwise_type,
+                                       const std::string elementwise_name) {
   int added_nodes = 0;
   std::unordered_map<std::string, int> expected_operators = {
-      {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}};
-  MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
-           expected_operators, added_nodes, 1.f, 1.f, "e");
+      {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}};
+  MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
+           variable_names_elementwise, expected_operators, added_nodes, 1.f,
+           1.f, "e");
 }
 
-TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
+void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type,
+                                           const std::string elementwise_name) {
   int added_nodes = 0;
   std::unordered_map<std::string, int> expected_operators = {
-      {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}};
-  MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add,
-           expected_operators, added_nodes, 1.f, 1.f, "", "b");
+      {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}};
+  MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name),
+           variable_names_elementwise, expected_operators, added_nodes, 1.f,
+           1.f, "", "b");
+}
+
+TEST(CpuQuantizePass, elementwise_add) {
+  TestElementwise("elementwise_add", "ElementwiseAdd");
+}
+
+TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
+  TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd");
+}
+
+TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
+  TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd");
+}
+
+TEST(CpuQuantizePass, elementwise_mul) {
+  TestElementwise("elementwise_mul", "ElementwiseMul");
+}
+
+TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) {
+  TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul");
+}
+
+TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) {
+  TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul");
 }
 
 const std::vector<std::string> churn_out_vars(ProgramDesc* prog,
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index 5f74b61ee86aa..3b883dac9782a 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Marks operators which are to be quantized.";
   std::unordered_set<std::string> supported_op_types =
       std::unordered_set<std::string>(
-          {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc",
-           "matmul", "nearest_interp", "nearest_interp_v2", "pool2d",
-           "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm",
-           "multi_gru", "slice"});
+          {"concat", "conv2d", "depthwise_conv2d", "elementwise_add",
+           "elementwise_mul", "fc", "matmul", "nearest_interp",
+           "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2",
+           "fusion_gru", "fusion_lstm", "multi_gru", "slice"});
   const auto& excluded_ids_list =
       Get<std::unordered_set<int>>("quantize_excluded_op_ids");
   const auto& op_types_list =
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 11190309814e7..4236dc55d5186 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -25,15 +25,22 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(conv2d_transpose, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(batch_norm, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
 
 USE_OP_ITSELF(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
-USE_OP(conv2d_transpose);
+USE_OP_ITSELF(conv2d_transpose);
 USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, MKLDNN);
+PD_DECLARE_ARG_MAPPING_FN(gelu);
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
index d578ada0db00f..7df957b2c0eca 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
@@ -18,6 +18,7 @@
 #include <unordered_set>
 
 #include <boost/logic/tribool.hpp>
+
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -25,12 +26,13 @@ USE_OP_ITSELF(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
-USE_OP(leaky_relu);
+USE_OP_ITSELF(leaky_relu);
 USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_ITSELF(relu);
-USE_OP(tanh);
+USE_OP_ITSELF(tanh);
 USE_OP_DEVICE_KERNEL(tanh, MKLDNN);
+PD_DECLARE_ARG_MAPPING_FN(gelu);
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 7e61d6ae4248b..8c51c278d4872 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -125,6 +125,7 @@ class Node {
   // Only use this for auto parallel.
   // A node does not have original desc if the return is zero.
   uint64_t OriginalDescId() const { return original_desc_id_; }
+  int GraphId() const { return graph_id_; }
 
   bool IsOp() const { return type_ == Type::kOperation; }
   bool IsVar() const { return type_ == Type::kVariable; }
@@ -246,10 +247,12 @@ class Node {
   // Store the original id of var desc or op desc.
   // Only use this for auto parallel.
   uint64_t original_desc_id_{0};
+  int graph_id_{-1};
 
  private:
   // ID can only set by a Graph.
   void SetId(int id) { id_ = id; }
+  void SetGraphId(int graph_id) { graph_id_ = graph_id; }
 
   // desc_order can only set by a Graph when constructing a Graph from a
   // BlockDesc.
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index ece4815858640..f30d1ea1b83dd 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
 void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_MKLDNN
   platform::AttachPointerHashToMKLDNNKey(this, place_);
+  platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
   for (auto &op : ops_) {
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 1ae7d53799323..b7b09da5ce03a 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -22,8 +22,6 @@ cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
 
 # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 # skip win32 since wget is not installed by default on windows machine.
-# skip COVERAGE_CI since the test runs slowly because of instrumentation.
-
 if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     add_custom_target(
         download_program
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 878b845211ca1..62e801b76955d 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -41,6 +41,7 @@ namespace paddle {
 namespace framework {
 // NOTE(Aurelius84): Need a better strategy to determine it.
 static constexpr size_t kHostNumThreads = 4;
+static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
   return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
@@ -54,8 +55,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
       global_scope_(global_scope),
       stream_analyzer_(place) {
   is_build_ = false;
-  async_work_queue_.reset(
-      new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
+  async_work_queue_.reset(new interpreter::AsyncWorkQueue(
+      kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (IsInterpretercoreFastGCEnabled()) {
@@ -271,6 +272,10 @@ void InterpreterCore::Convert(
   if (FLAGS_new_executor_use_inplace) {
     BuildInplace();
   }
+
+  // prepare for the first time.
+  async_work_queue_->PrepareAtomicDeps(dependecy_count_);
+  async_work_queue_->PrepareAtomicVarRef(vec_meta_info);
 }
 
 bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
@@ -388,18 +393,18 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
                            : global_scope_->GetMutableScope();
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
-    platform::RecordEvent infershape_event(
-        "infer_shape", platform::TracerEventType::OperatorInner, 1,
-        platform::EventRole::kInnerOp);
-    // If it is OperatorBase, InferShape do nothing.
-    if (op_with_kernel != nullptr)
+    if (op_with_kernel != nullptr) {
+      platform::RecordEvent infershape_event(
+          "infer_shape", platform::TracerEventType::OperatorInner, 1,
+          platform::EventRole::kInnerOp);
+      // If it is OperatorBase, InferShape do nothing.
       op_with_kernel->Info().infer_shape_(
           instr_node.InnerInferShapeContext().get());
+    }
   }
 
-  if (op_with_kernel != nullptr &&
-      FLAGS_new_executor_use_inplace) {  // TODO(xiongkun03) Does operator
-                                         // base support inplace ?
+  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
+    // TODO(xiongkun03) Does operator base support inplace ?
     for (auto& pair : instr_node.InplaceInfo()) {
       const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
       auto* out =
@@ -409,6 +414,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       }
     }
   }
+
   {
     platform::RecordEvent compute_event(
         "compute", platform::TracerEventType::OperatorInner, 1,
@@ -458,16 +464,24 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
 void InterpreterCore::ExecuteInstructionList(
     const std::vector<Instruction>& vec_instr) {
+  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
+  // those for the next step
+  auto atomic_deps = async_work_queue_->AtomicDeps();
+  auto atomic_var_ref = async_work_queue_->AtomicVarRef();
+
   async_work_queue_->PrepareAtomicDeps(dependecy_count_);
   async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
+
   unfinished_op_numer_ = vec_instr.size();
 
   exception_holder_.Clear();
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
-                                 [&, i] { RunInstructionAsync(i); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
+        this, i, atomic_deps = atomic_deps.get(),
+        atomic_var_ref = atomic_var_ref.get()
+      ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
     }
   }
 
@@ -490,11 +504,16 @@ void InterpreterCore::ExecuteInstructionList(
 }
 
 void InterpreterCore::RunNextInstructions(
-    const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
+    const Instruction& instr, std::queue<size_t>* reserved_next_ops,
+    std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
+  VLOG(4) << "atomic 1:" << atomic_deps;
   auto& next_instr = instr.NextInstructions();
-  auto& atomic_deps = async_work_queue_->AtomicDeps();
-  auto IsReady = [&](size_t next_id) {
-    return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+
+  auto IsReady = [atomic_deps](size_t next_id) {
+    VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
+            << " " << next_id;
+    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
   };
 
   if (instr.KernelType() == OpFuncType::kQueueAsync) {
@@ -503,7 +522,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref]() {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     // keep all async_ops running in current thread
@@ -523,7 +544,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
@@ -539,14 +562,18 @@ void InterpreterCore::RunNextInstructions(
         // move rest ops into other threads
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     if (first_op != 0) reserved_next_ops->push(first_op);
   }
 }
 
-void InterpreterCore::RunInstructionAsync(size_t instr_id) {
+void InterpreterCore::RunInstructionAsync(
+    size_t instr_id, std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   std::queue<size_t> ready_ops;
   ready_ops.push(instr_id);
   while (!ready_ops.empty()) {
@@ -571,7 +598,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       RecordStreamForGC(instr_node);
 #endif
-      CheckGC(instr_node);
+      CheckGC(instr_node, atomic_var_ref);
     } catch (platform::EnforceNotMet& ex) {
       framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
       exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
@@ -605,7 +632,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 
     interpreter::RecordEvent(instr_node, place_);
 
-    RunNextInstructions(instr_node, &ready_ops);
+    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
   }
 }
 
@@ -703,17 +730,19 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
 }
 #endif
 
-void InterpreterCore::CheckGC(const Instruction& instr) {
+void InterpreterCore::CheckGC(
+    const Instruction& instr,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   size_t instr_id = instr.Id();
   auto& var_scope = *global_scope_;
-  auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
 
   for (auto var_id : instr.GCCheckVars()) {
     VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
             << var_scope.VarDesc(var_id);
-
+    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
+            << " " << var_id;
     bool is_ready =
-        atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
     // ignore all persistable var while GC
     if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
       continue;
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 51734abbb1bf8..c1ade85e1384c 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -76,11 +76,16 @@ class InterpreterCore {
   void RecordStreamForGC(const Instruction& instr);
 #endif
 
-  void CheckGC(const Instruction& instr);
+  void CheckGC(const Instruction& instr,
+               std::vector<std::atomic<size_t>>* atomic_var_ref);
 
-  void RunInstructionAsync(size_t instr_id);
+  void RunInstructionAsync(size_t instr_id,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
   void RunNextInstructions(const Instruction& instr_id,
-                           std::queue<size_t>* reserved_next_ops);
+                           std::queue<size_t>* reserved_next_ops,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
 
   void BuildSkipShareLoDInfo();
 
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index d595af58257d4..a045d6c7f4a65 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -44,32 +44,37 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type,
 
 using VariableIdMap = std::map<std::string, std::vector<int>>;
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
+void AsyncWorkQueue::PrepareAtomicDeps(
     const std::vector<size_t>& dependecy_count) {
-  if (atomic_deps_.size() != dependecy_count.size()) {
-    atomic_deps_.clear();
-    std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < dependecy_count.size(); ++i) {
-    atomic_deps_[i]->store(dependecy_count[i]);
-  }
-  return atomic_deps_;
+  VLOG(4) << "PrepareAtomicDeps";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_deps_ = p->get_future();
+  queue_group_->AddTask(2, [&dependecy_count, p] {
+    auto* op_deps =
+        new std::vector<std::atomic<size_t>>(dependecy_count.size());
+    for (size_t i = 0; i < dependecy_count.size(); ++i) {
+      (*op_deps)[i] = dependecy_count[i];
+    }
+    VLOG(4) << "AtomicDeps:" << op_deps << " " << (*op_deps).size();
+    p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(op_deps));
+  });
 }
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef(
+void AsyncWorkQueue::PrepareAtomicVarRef(
     const std::vector<VariableMetaInfo>& vec_meta_info) {
-  if (atomic_var_ref_.size() != vec_meta_info.size()) {
-    atomic_var_ref_.clear();
-    std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < vec_meta_info.size(); ++i) {
-    atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_);
-  }
-  return atomic_var_ref_;
+  VLOG(4) << "PrepareAtomicVarRef";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_var_ref_ = p->get_future();
+  queue_group_->AddTask(2, [&vec_meta_info, p] {
+    auto* var_ref = new std::vector<std::atomic<size_t>>(vec_meta_info.size());
+    for (size_t i = 0; i < vec_meta_info.size(); ++i) {
+      (*var_ref)[i] = vec_meta_info[i].var_ref_count_;
+    }
+    VLOG(4) << "AtomicVarRef:" << var_ref << " " << (*var_ref).size();
+    p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(var_ref));
+  });
 }
 
 bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index 81c05df62ec41..044a9ea368cbc 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -50,11 +50,13 @@ namespace framework {
 
 namespace interpreter {
 
-using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
+using AtomicVectorSizeT =
+    std::future<std::unique_ptr<std::vector<std::atomic<size_t>>>>;
 
 class AsyncWorkQueue {
  public:
-  AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter)
+  AsyncWorkQueue(size_t host_num_threads, size_t deivce_num_threads,
+                 EventsWaiter* waiter)
       : host_num_thread_(host_num_threads) {
     std::vector<WorkQueueOptions> group_options;
     // for execute host Kernel
@@ -66,6 +68,13 @@ class AsyncWorkQueue {
                                /*events_waiter*/ waiter);
     // for launch device Kernel
     group_options.emplace_back(/*name*/ "DeviceKernelLaunch",
+                               /*num_threads*/ deivce_num_threads,
+                               /*allow_spinning*/ true,
+                               /*track_task*/ false,
+                               /*detached*/ true,
+                               /*events_waiter*/ waiter);
+    // for prepare deps and others
+    group_options.emplace_back(/*name*/ "Prepare",
                                /*num_threads*/ 1,
                                /*allow_spinning*/ true,
                                /*track_task*/ false,
@@ -74,10 +83,8 @@ class AsyncWorkQueue {
     queue_group_ = CreateWorkQueueGroup(group_options);
   }
 
-  AtomicVectorSizeT& PrepareAtomicDeps(
-      const std::vector<size_t>& dependecy_count);
-  AtomicVectorSizeT& PrepareAtomicVarRef(
-      const std::vector<VariableMetaInfo>& vec_meta_info);
+  void PrepareAtomicDeps(const std::vector<size_t>& dependecy_count);
+  void PrepareAtomicVarRef(const std::vector<VariableMetaInfo>& vec_meta_info);
 
   // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); }
 
@@ -85,8 +92,12 @@ class AsyncWorkQueue {
 
   void Cancel() { queue_group_->Cancel(); }
 
-  AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; }
-  AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicDeps() {
+    return atomic_deps_.get();
+  }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicVarRef() {
+    return atomic_var_ref_.get();
+  }
 
  private:
   size_t host_num_thread_;
diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index 81d2c3d936b9c..4231c75748167 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -20,42 +20,43 @@
 // #include "gperftools/profiler.h"
 
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(fill_constant);
-USE_OP(uniform_random);
+USE_OP_ITSELF(uniform_random);
 USE_OP(lookup_table);
-USE_OP(transpose2);
+USE_OP_ITSELF(transpose2);
 USE_OP_ITSELF(reshape2);
-USE_OP(split);
-USE_OP(slice);
-USE_OP(concat);
-USE_OP(matmul);
+USE_OP_ITSELF(split);
+USE_OP_ITSELF(slice);
+USE_OP_ITSELF(concat);
+USE_OP_ITSELF(matmul);
 USE_OP_ITSELF(elementwise_add);
-USE_OP(sigmoid);
-USE_OP(tanh);
-USE_OP(elementwise_mul);
+USE_OP_ITSELF(sigmoid);
+USE_OP_ITSELF(tanh);
+USE_OP_ITSELF(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);
 USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(reduce_sum);
 USE_OP_ITSELF(reduce_sum_grad);
-USE_OP(reduce_mean_grad);
+USE_OP_ITSELF(reduce_mean_grad);
 USE_OP_ITSELF(reshape2_grad);
-USE_OP(softmax_with_cross_entropy_grad);
+USE_OP_ITSELF(softmax_with_cross_entropy_grad);
 USE_OP_ITSELF(elementwise_add_grad);
-USE_OP(matmul_grad);
-USE_OP(square);
-USE_OP(transpose2_grad);
+USE_OP_ITSELF(matmul_grad);
+USE_OP_ITSELF(square);
+USE_OP_ITSELF(transpose2_grad);
 USE_OP(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
-USE_OP(sigmoid_grad);
-USE_OP(tanh_grad);
+USE_OP_ITSELF(sigmoid_grad);
+USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
-USE_OP(slice_grad);
-USE_OP(lookup_table_grad);
+USE_OP_ITSELF(slice_grad);
+USE_OP_ITSELF(lookup_table_grad);
 USE_OP(sqrt);
 USE_OP(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
-USE_OP(sgd);
+USE_OP_ITSELF(sgd);
 USE_OP(squared_l2_norm);
 USE_OP_ITSELF(memcpy_h2d);
 USE_OP_ITSELF(memcpy_d2h);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f8e30c1ee294e..15777c287b422 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -628,10 +628,12 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
 
 bool OpSupportGPU(const std::string& op_type) {
   // check in new Function kernel first
+  bool has_phi_kernel = false;
   auto& kernel_factory = phi::KernelFactory::Instance();
   auto kernel_key_map =
       kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type));
   for (auto& kernel : kernel_key_map) {
+    has_phi_kernel = true;
     if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) {
       return true;
     }
@@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) {
 
   auto& all_kernels = OperatorWithKernel::AllOpKernels();
   auto it = all_kernels.find(op_type);
-  if (it == all_kernels.end()) {
-    // All control operator must support GPU
-    return true;
-  }
-  for (auto& kern_pair : it->second) {
-    if (platform::is_gpu_place(kern_pair.first.place_)) {
+  if (it != all_kernels.end()) {
+    for (auto& kern_pair : it->second) {
+      if (platform::is_gpu_place(kern_pair.first.place_)) {
+        return true;
+      }
+    }
+  } else {
+    if (has_phi_kernel) {
+      // if has phi kernel, but not find phi gpu kernel and fluid gpu kernel,
+      // this op doesn't support GPU
+      return false;
+    } else {
+      // All control operator must support GPU
       return true;
     }
   }
@@ -1113,7 +1122,15 @@ static void CheckTensorNANOrInf(const std::string& op_type,
 
 bool OperatorWithKernel::SupportsMKLDNN(
     const proto::VarType::Type data_type) const {
-  auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+  auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
+  if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
+    VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid "
+                                       "Registered Kernels. And We don't "
+                                       "search its kernels in phi lib, "
+                                       "SupportsMKLDNN() return false.";
+    return false;
+  }
+  auto& op_kernels = op_kernel_iter->second;
   return std::any_of(op_kernels.begin(), op_kernels.end(),
                      [data_type](OpKernelMap::const_reference kern_pair) {
                        return platform::is_cpu_place(kern_pair.first.place_) &&
@@ -1456,7 +1473,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
-#ifdef PADDLE_WITH_XPU
+
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
   if (platform::is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() ||
        !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
@@ -1470,17 +1488,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
 #endif
 
 #ifdef PADDLE_WITH_XPU_KP
-  bool use_xpu_kp_kernel_rt =
-      FLAGS_run_kp_kernel &&
-      paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
-  bool use_xpu_kp_kernel_debug =
-      paddle::platform::is_in_xpu_kpwhite_list(type_);
-  if (platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
-    expected_kernel_key.library_type_ = LibraryType::kKP;
-    kernel_iter = kernels.find(expected_kernel_key);
-    VLOG(3) << "using XPU KP kernel: " << type_
-            << ", using_kernel_key:" << expected_kernel_key;
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
+    bool use_xpu_kp_kernel_rt =
+        FLAGS_run_kp_kernel &&
+        paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key);
+    bool use_xpu_kp_kernel_debug =
+        paddle::platform::is_in_xpu_kpwhite_list(type_);
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
+    }
+    if (use_xpu_kp_kernel_debug) {
+      VLOG(3) << "xpu_kp using debug mode ";
+    }
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
+      expected_kernel_key.library_type_ = LibraryType::kKP;
+      kernel_iter = kernels.find(expected_kernel_key);
+      VLOG(3) << "using XPU KP kernel: " << type_
+              << ", using_kernel_key:" << expected_kernel_key;
+    }
+    bool is_xpu_unsupport =
+        (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
+         paddle::platform::is_in_xpu_black_list(type_));
+    if (!is_xpu_kp_support &&
+        (kernel_iter == kernels.end() || is_xpu_unsupport)) {
+      VLOG(3) << "missing XPU kernel: " << type_
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
   }
 #endif
 
@@ -2083,16 +2120,25 @@ void OperatorWithKernel::BuildPhiKernelContext(
       auto* var = ins_vector[offset];
       if (var->IsType<framework::LoDTensor>()) {
         tensor_in = &(var->Get<framework::LoDTensor>());
+        pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
       } else if (var->IsType<phi::SelectedRows>()) {
         tensor_in = &(var->Get<phi::SelectedRows>());
+        pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        paddle::SmallVector<const phi::TensorBase*> tensor_vector;
+        auto& tensor_array = var->Get<framework::LoDTensorArray>();
+        for (auto& t : tensor_array) {
+          tensor_vector.emplace_back(&t);
+        }
+        pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector);
+        end_idx += tensor_array.size() - 1;
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported input `%s` type when call pt kernel.",
             framework::ToTypeName(var->Type())));
       }
-
-      pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
     }
+    // Note: here cannot deal with vector<LoDTensorArray> input
     pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
   VLOG(4) << "Done inputs";
@@ -2120,22 +2166,33 @@ void OperatorWithKernel::BuildPhiKernelContext(
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
       phi::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset];
-
       if (var) {
         if (var->template IsType<framework::LoDTensor>()) {
           tensor_out = var->template GetMutable<framework::LoDTensor>();
+          pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
         } else if (var->template IsType<phi::SelectedRows>()) {
           tensor_out = var->template GetMutable<phi::SelectedRows>();
+          pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
+        } else if (var->template IsType<framework::LoDTensorArray>()) {
+          paddle::SmallVector<phi::TensorBase*> tensor_vector;
+          auto* tensor_array =
+              var->template GetMutable<framework::LoDTensorArray>();
+          // Note: If the input LoDTensorArray size is 0, the output
+          // LoDTensorArray is also 0
+          for (auto& t : *tensor_array) {
+            tensor_vector.emplace_back(&t);
+          }
+          pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector);
+          end_idx += tensor_array->size() - 1;
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
               "Unsupported output `%s` type when call pt kernel.",
               framework::ToTypeName(var->Type())));
         }
+      } else {
+        pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
       }
-
-      pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
-
     pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
   VLOG(4) << "Done outputs";
@@ -2250,42 +2307,67 @@ void OperatorWithKernel::BuildPhiKernelContext(
       }
     } else {
       // TODO(chenweihang): support other attrs later
-      auto& attr = Attrs().at(attr_names[i]);
+      auto attr_it = attrs_.find(attr_names[i]);
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+        if (attr_it == attrs_.end()) {
+          auto in_it = ctx.inputs.find(attr_names[i]);
+          if (in_it != ctx.inputs.end()) {
+            // get data from input
+            auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0]));
+            int32_t val_int = val.template to<int32_t>();
+            pt_kernel_context->EmplaceBackAttr(val_int);
+          } else {
+            PADDLE_THROW(platform::errors::NotFound(
+                "can not find attribute `%s` both in attribute and input ",
+                attr_names[i]));
+          }
+        } else {
+          pt_kernel_context->EmplaceBackAttr(
+              BOOST_GET_CONST(int, attr_it->second));
+        }
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(float, attr_it->second));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(bool, attr_it->second));
       } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(int64_t, attr_it->second));
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::string))) {
-        pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr));
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(std::string, attr_it->second));
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(phi::DataType))) {
         auto data_type = paddle::framework::TransToPhiDataType(
             static_cast<framework::proto::VarType::Type>(
-                BOOST_GET_CONST(int, attr)));
+                BOOST_GET_CONST(int, attr_it->second)));
         pt_kernel_context->EmplaceBackAttr(data_type);
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int64_t>))) {
-        if (std::type_index(attr.type()) ==
+        if (std::type_index(attr_it->second.type()) ==
             std::type_index(typeid(std::vector<int64_t>))) {
           pt_kernel_context->EmplaceBackAttr(
-              BOOST_GET_CONST(std::vector<int64_t>, attr));
-        } else if (std::type_index(attr.type()) ==
+              BOOST_GET_CONST(std::vector<int64_t>, attr_it->second));
+        } else if (std::type_index(attr_it->second.type()) ==
                    std::type_index(typeid(std::vector<int>))) {
           // Emplace Back Attr according to the type of Phi_Kernel args.
-          const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
+          const auto& vector_int_attr =
+              BOOST_GET_CONST(std::vector<int>, attr_it->second);
           const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(),
                                                        vector_int_attr.end());
           pt_kernel_context->EmplaceBackAttr(vector_int64_attr);
         }
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int32_t>))) {
-        const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr);
+        const auto& vector_int_attr =
+            BOOST_GET_CONST(std::vector<int>, attr_it->second);
         pt_kernel_context->EmplaceBackAttr(vector_int_attr);
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(std::vector<std::string>))) {
+        pt_kernel_context->EmplaceBackAttr(
+            BOOST_GET_CONST(std::vector<std::string>, attr_it->second));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported cast op attribute `%s` when construct "
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 1a1171f1dba4d..6f68c261d2b24 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
     return ctx_.InputVar(name)->IsType<phi::SelectedRows>();
   }
 
+  bool IsDenseTensorVectorInput(const std::string& name) const override {
+    return ctx_.InputVar(name)->IsType<framework::LoDTensorArray>();
+  }
+
   bool IsDenseTensorOutput(const std::string& name) const override {
     return ctx_.OutputVar(name)->IsType<framework::LoDTensor>();
   }
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index db4f6761bcec9..1669fba1327e5 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -144,6 +144,9 @@ class Scope : public ScopeBase {
   void Rename(const std::string& origin_name,
               const std::string& new_name) const;
 
+  // Return the number of variables in scope
+  size_t Size() { return vars_.size(); }
+
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 10ceae62dccbb..e8cd84248ea85 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -79,18 +79,6 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_IPU
-  else if (platform::is_ipu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_ipu_place(dst_place)) {
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (platform::is_ipu_place(src_place) &&
-             platform::is_ipu_place(dst_place)) {
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
@@ -390,6 +378,29 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
         "Copying from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
+#ifdef PADDLE_WITH_IPU
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_ipu_place(dst_place)) {
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_ipu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copying from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 }
 
 template <typename TENSOR>
@@ -447,27 +458,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_IPU
-  else if (platform::is_ipu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
-             platform::is_ipu_place(dst_place)) {
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else {  // NOLINT
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Copy from %s to %s is not supported.", src_place, dst_place));
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {     /* custom_device -> cpu*/
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
+  }                                                // NOLINT
   else if (platform::is_cpu_place(src_place) &&    // NOLINT
            platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
+  }                                                 // NOLINT
   else if (platform::is_custom_place(src_place) &&  // NOLINT
            platform::is_custom_place(
                dst_place)) { /* custom_device -> custom_device*/
@@ -483,11 +482,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   else if (platform::is_xpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  }
+  }                                              // NOLINT
   else if (platform::is_cpu_place(src_place) &&  // NOLINT
            platform::is_xpu_place(dst_place)) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  }
+  }                                              // NOLINT
   else if (platform::is_xpu_place(src_place) &&  // NOLINT
            platform::is_xpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
@@ -502,7 +501,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
       auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
       xpu_ctx->Wait();
     }
-  }
+  }       // NOLINT
   else {  // NOLINT
     PADDLE_THROW(platform::errors::Unimplemented(
         "Copy from %s to %s is not supported.", src_place, dst_place));
@@ -601,6 +600,29 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
+#ifdef PADDLE_WITH_IPU
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_ipu_place(dst_place)) {
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_ipu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+  }
+  else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 }
 
 template <typename Predicate, typename DevCtx>
@@ -1224,8 +1246,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
   proto::VarType::TensorDesc desc;
   {  // int32_t size
      // proto buffer
-    int32_t size;
+    int32_t size = -1;
     is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable(
+                                           "Cannot read tensor desc size"));
+    PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument(
+                                   "Tensor desc size should >= 0"));
     std::unique_ptr<char[]> buf(new char[size]);
     is.read(reinterpret_cast<char*>(buf.get()), size);
     PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index 149202468be6c..7d60b7d26f3fb 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -124,7 +124,7 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                 unsupported_ops_gpu_bf16.end());
-// NOTE: GPU/NPU/XPU is compiled seperatly.
+// NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
 #elif defined(PADDLE_WITH_ASCEND_CL)
   auto unsupported_ops_npu_fp16 = std::get<2>(
       OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
@@ -143,6 +143,15 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
                                 unsupported_ops_xpu_bf16.end());
+#elif defined(PADDLE_WITH_MLU)
+  auto unsupported_ops_mlu_fp16 = std::get<2>(
+      OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
+  unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
+                                unsupported_ops_mlu_fp16.end());
+  auto unsupported_ops_mlu_bf16 = std::get<2>(
+      OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
+  unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
+                                unsupported_ops_mlu_bf16.end());
 #endif
   VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
           << unsupported_fp16_ops_->size() << " "
@@ -209,7 +218,10 @@ inline bool NeedCast(const std::shared_ptr<VarType>& var) {
   auto data_type = GetDataType<VarType>(var);
   if (paddle::platform::is_gpu_place(place) ||
       paddle::platform::is_cuda_pinned_place(place) ||
-      paddle::platform::is_xpu_place(place)) {
+      paddle::platform::is_xpu_place(place) ||
+      paddle::platform::is_mlu_place(place) ||
+      paddle::platform::is_npu_place(place) ||
+      paddle::platform::is_npu_pinned_place(place)) {
     // CudaPinndePlace is added for varbase created by dataloader
     if (data_type == paddle::framework::proto::VarType::FP32 ||
         data_type == paddle::framework::proto::VarType::FP16 ||
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 7416d206fc43e..d7478b18dba06 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type,
 }
 
 void BasicEngine::Execute() {
+  platform::RecordEvent backward_record_event(
+      "backward", platform::TracerEventType::Operator, 1);
+
   if (init_nodes_.empty()) {
     return;
   }
@@ -412,7 +415,7 @@ void BasicEngine::Execute() {
 
     for (auto& cur_op : *shared_cur_node) {
       platform::RecordEvent op_type_record_event(
-          cur_op.Type(), platform::TracerEventType::Operator, 1);
+          cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1);
 
       ++op_num;
 
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 12aa13bbacc3b..499cf4d8ad6d8 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
   }
   if (data_type == framework::proto::VarType::BF16) {
     if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       return TensorAddImpl<platform::CUDADeviceContext, platform::bfloat16>(
           src_tensor, dst_tensor, place);
 #else
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index bae49fb381a47..a427b9b819911 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto& kernels = kernels_iter->second;
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-#ifdef PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
   if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
       (kernel_iter == kernels.end() || is_xpu_unsupport)) {
     VLOG(3) << "missing XPU kernel: " << op.Type()
@@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
   }
-
 #endif
 
 #ifdef PADDLE_WITH_XPU_KP
-  expected_kernel_key.place_ = platform::XPUPlace();
-  bool use_xpu_kp_kernel_rt =
-      FLAGS_run_kp_kernel &&
-      paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
-  bool use_xpu_kp_kernel_debug =
-      paddle::platform::is_in_xpu_kpwhite_list(op.Type());
-  if (use_xpu_kp_kernel_rt) {
-    VLOG(3) << "xpu_kp using rt mode ";
-  }
-  if (use_xpu_kp_kernel_debug) {
-    VLOG(3) << "xpu_kp using debug mode ";
-  }
-  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
-      (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) {
-    expected_kernel_key.place_ = platform::XPUPlace();
-    expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
-    kernel_iter = kernels.find(expected_kernel_key);
-    VLOG(3) << "using XPU KP kernel: " << op.Type()
-            << ", using_kernel_key:" << expected_kernel_key;
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) {
+    bool use_xpu_kp_kernel_rt =
+        FLAGS_run_kp_kernel &&
+        paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key);
+    bool use_xpu_kp_kernel_debug =
+        paddle::platform::is_in_xpu_kpwhite_list(op.Type());
+    if (use_xpu_kp_kernel_rt) {
+      VLOG(3) << "xpu_kp using rt mode ";
+    }
+    if (use_xpu_kp_kernel_debug) {
+      VLOG(3) << "xpu_kp using debug mode ";
+    }
+    bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
+    if (is_xpu_kp_support) {
+      expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP;
+      kernel_iter = kernels.find(expected_kernel_key);
+      VLOG(3) << "using XPU KP kernel: " << op.Type()
+              << ", using_kernel_key:" << expected_kernel_key;
+    }
+    if (!is_xpu_kp_support &&
+        (kernel_iter == kernels.end() || is_xpu_unsupport)) {
+      VLOG(3) << "missing XPU kernel: " << op.Type()
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
   }
 #endif
 
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 91e6974fa2edd..9daac181d57de 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -289,14 +289,23 @@ void BuildDygraphPhiKernelContext(
       auto& var = ins_vector[offset]->Var();
       if (var.template IsType<phi::DenseTensor>()) {
         tensor_in = &(var.template Get<phi::DenseTensor>());
+        kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
       } else if (var.template IsType<phi::SelectedRows>()) {
         tensor_in = &(var.template Get<phi::SelectedRows>());
+        kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
+      } else if (var.template IsType<framework::LoDTensorArray>()) {
+        paddle::SmallVector<const phi::TensorBase*> tensor_vector;
+        auto& tensor_array = var.template Get<framework::LoDTensorArray>();
+        for (auto& t : tensor_array) {
+          tensor_vector.emplace_back(&t);
+        }
+        kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector);
+        end_idx += tensor_array.size() - 1;
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported input `%s` type when call pt kernel.",
             framework::ToTypeName(var.Type())));
       }
-      kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
@@ -326,22 +335,32 @@ void BuildDygraphPhiKernelContext(
       if (var) {
         if (var->template IsType<phi::DenseTensor>()) {
           tensor_out = var->template GetMutable<phi::DenseTensor>();
+          kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
         } else if (var->template IsType<phi::SelectedRows>()) {
           tensor_out = var->template GetMutable<phi::SelectedRows>();
+          kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
+        } else if (var->template IsType<framework::LoDTensorArray>()) {
+          paddle::SmallVector<phi::TensorBase*> tensor_vector;
+          auto* tensor_array =
+              var->template GetMutable<framework::LoDTensorArray>();
+          for (auto& t : *tensor_array) {
+            tensor_vector.emplace_back(&t);
+          }
+          kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector);
+          end_idx += tensor_array->size() - 1;
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
               "Unsupported output `%s` type when call pt kernel.",
               framework::ToTypeName(var->Type())));
         }
+      } else {
+        kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
       }
-
-      kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
     kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
-    VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
     if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
       if (attrs.find(attr_names[i]) !=
           attrs.end()) {  // shape is in the attribute
@@ -419,6 +438,17 @@ void BuildDygraphPhiKernelContext(
             experimental::MakePhiScalarFromVar(ins_vector[0]->Var())));
       }
 
+    } else if (ins.find(attr_names[i]) != ins.end()) {
+      // deal tensor attr here
+      auto& ins_vector = ins.at(attr_names[i]);
+      auto tensor_attr =
+          experimental::MakePhiScalarFromVar(ins_vector[0]->Var());
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
+        int val = tensor_attr.template to<int>();
+        kernel_ctx->EmplaceBackAttr(val);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented("only support int here"));
+      }
     } else if (attr_defs[i].type_index ==
                std::type_index(typeid(std::vector<phi::Scalar>))) {
       auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
@@ -475,6 +505,7 @@ void BuildDygraphPhiKernelContext(
       }
     } else {
       // TODO(chenweihang): support other attrs later
+
       auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
         kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
@@ -510,6 +541,10 @@ void BuildDygraphPhiKernelContext(
       } else if (attr_defs[i].type_index ==
                  std::type_index(typeid(std::vector<int>))) {
         kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector<int>, attr));
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(std::vector<std::string>))) {
+        kernel_ctx->EmplaceBackAttr(
+            BOOST_GET_CONST(std::vector<std::string>, attr));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported cast op attribute `%s` when construct "
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index fec9afbf3b403..03fa46eab5367 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -1109,8 +1109,9 @@ void Reducer::FinalizeBackward() {
 
   if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||      \
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
+    defined(PADDLE_WITH_CNCL)
     ProcessUnusedDenseVars();
 #endif
     // Initialize local used vars
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index f754c6fdd0ee7..75876e07fb5c7 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -32,6 +32,8 @@
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 01c9d2847e0c8..3d4cfa2df3179 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -176,8 +176,22 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
                      const std::map<std::string, std::string>& inplace_map,
                      paddle::framework::AttributeMap* passed_default_attrs_,
                      bool use_default_attr_map) {
+  TraceOpImpl<VarType>(type, ins, outs, attrs, place, trace_backward,
+                       inplace_map, passed_default_attrs_,
+                       use_default_attr_map);
+}
+
+template <typename VarType>
+void Tracer::TraceOpImpl(const std::string& type,
+                         const NameVarMap<VarType>& ins,
+                         const NameVarMap<VarType>& outs,
+                         framework::AttributeMap& attrs,
+                         const platform::Place& place, bool trace_backward,
+                         const std::map<std::string, std::string>& inplace_map,
+                         paddle::framework::AttributeMap* passed_default_attrs_,
+                         bool use_default_attr_map) {
   platform::RecordEvent op_type_record_event(
-      type, platform::TracerEventType::Operator, 1);
+      type + " trace_op", platform::TracerEventType::Operator, 1);
   platform::ScopedFlushDenormal flush;
   VLOG(1) << "Trace Op: " << type;
   if (FLAGS_use_mkldnn) {
@@ -297,19 +311,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
     program_desc_tracer_->InsertOp(type, new_ins, outs, attrs);
   }
 
-  if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
-    PADDLE_ENFORCE_EQ(
-        passed_default_attrs_, nullptr,
-        paddle::platform::errors::PermissionDenied(
-            "We expect passed_default_attrs_ is nullptr while "
-            "use_default_attr_map is true, however we got not null "
-            "passed_default_attrs_. Please check your usage of trace_op. "));
-    CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place,
-                     inplace_map);
-  } else {
-    VLOG(3) << "No Grad to track for Op: " << type;
+  {
+    platform::RecordEvent node_creation_record_event(
+        type + " node_creation", platform::TracerEventType::Operator, 1);
+
+    if (ComputeRequiredGrad(new_ins, outs, trace_backward)) {
+      PADDLE_ENFORCE_EQ(
+          passed_default_attrs_, nullptr,
+          paddle::platform::errors::PermissionDenied(
+              "We expect passed_default_attrs_ is nullptr while "
+              "use_default_attr_map is true, however we got not null "
+              "passed_default_attrs_. Please check your usage of trace_op. "));
+      CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place,
+                       inplace_map);
+    } else {
+      VLOG(3) << "No Grad to track for Op: " << type;
+    }
+    VLOG(6) << "Finish Trace Op: " << type;
   }
-  VLOG(6) << "Finish Trace Op: " << type;
 }
 
 template void Tracer::TraceOp<VarBase>(
@@ -335,25 +354,33 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
 
 void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
                      const NameTensorMap& outs,
-                     paddle::framework::AttributeMap attrs,
+                     paddle::framework::AttributeMap& attrs,
                      const paddle::platform::Place& place,
                      paddle::framework::AttributeMap* default_attrs,
                      bool use_default_attr_map,
                      const std::map<std::string, std::string>& inplace_map) {
   VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: "
           << use_default_attr_map;
-  TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs), place, false,
-                              inplace_map, default_attrs, use_default_attr_map);
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, place, false,
+                                  inplace_map, default_attrs,
+                                  use_default_attr_map);
+}
+
+void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
+                     const NameTensorMap& outs,
+                     paddle::framework::AttributeMap attrs) {
+  VLOG(6) << "Running On Eager TraceOp(4 agrs): ";
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
+                                  false, {}, nullptr, true);
 }
 
 void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
                      const NameTensorMap& outs,
-                     paddle::framework::AttributeMap attrs,
+                     paddle::framework::AttributeMap& attrs,
                      const std::map<std::string, std::string>& inplace_map) {
   VLOG(6) << "Running On Eager TraceOp(less): ";
-  TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs),
-                              expected_place_, false, inplace_map, nullptr,
-                              true);
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
+                                  false, inplace_map, nullptr, true);
 }
 
 void Tracer::SetExpectedPlace(platform::Place place) {
@@ -385,8 +412,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins,
 }
 
 phi::KernelSignature Tracer::GetExpectedKernelSignature(
-    const std::string& type, const NameVarBaseMap& ins,
-    const NameVarBaseMap& outs, framework::AttributeMap attrs) const {
+    const std::string& type, const NameTensorMap& ins,
+    const NameTensorMap& outs, framework::AttributeMap attrs) const {
   auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
   framework::RuntimeContext ctx({}, {});
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
@@ -401,7 +428,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature(
       attr_checker == nullptr ? empty_attrs_map
                               : attr_checker->GetDefaultAttrMap();
   auto dygraph_exe_ctx =
-      imperative::DygraphExecutionContext<imperative::VarBase>(
+      imperative::DygraphExecutionContext<egr::EagerVariable>(
           *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs,
           default_attrs);
   auto* opbase_with_kernel =
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index fd13fce6a6e17..4e671d52457e2 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -74,16 +74,32 @@ class Tracer {
                paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
                bool use_default_attr_map = true);
 
+  template <typename VarType>
+  void TraceOpImpl(
+      const std::string& type, const NameVarMap<VarType>& ins,
+      const NameVarMap<VarType>& outs,
+      framework::AttributeMap& attrs,  // NOLINT
+      const platform::Place& place, bool trace_backward,
+      const std::map<std::string, std::string>& inplace_map = {},
+      paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
+      bool use_default_attr_map = true);
+
   void TraceOp(const std::string& type, const NameVarBaseMap& ins,
                const NameVarBaseMap& outs, framework::AttributeMap attrs,
                const std::map<std::string, std::string>& inplace_map = {});
 
   void TraceOp(const std::string& type, const NameTensorMap& ins,
-               const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap& attrs,  // NOLINT
                const std::map<std::string, std::string>& inplace_map = {});
 
   void TraceOp(const std::string& type, const NameTensorMap& ins,
-               const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap attrs);
+
+  void TraceOp(const std::string& type, const NameTensorMap& ins,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap& attrs,  // NOLINT
                const paddle::platform::Place& place,
                paddle::framework::AttributeMap* default_attrs,
                bool use_default_attr_map,
@@ -156,8 +172,8 @@ class Tracer {
   }
 
   phi::KernelSignature GetExpectedKernelSignature(
-      const std::string& type, const NameVarBaseMap& ins,
-      const NameVarBaseMap& outs, framework::AttributeMap attrs) const;
+      const std::string& type, const NameTensorMap& ins,
+      const NameTensorMap& outs, framework::AttributeMap attrs) const;
 
   paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
       const platform::Place& place);
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index a5c32164bf1a2..74e8ca3f229c6 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -188,6 +188,9 @@ struct Argument {
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
+  DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool);
+  DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes,
+                      std::unordered_set<std::string>);
 
   // Usually use for trt dynamic shape.
   // TRT will select the best kernel according to opt shape
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 796c86a3ad1ef..287c896e49bf2 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument,
                 new int(argument->dlnne_min_subgraph_size()));
       pass->Set("program",
                 new framework::ProgramDesc *(&argument->main_program()));
+    } else if (pass_name == "mixed_precision_configure_pass") {
+      pass->Set("gpu_fp16_disabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->gpu_fp16_disabled_op_types()));
     }
     if (pass_name == "lite_subgraph_pass") {
       bool lite_enable_int8 =
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index daa18d8c78bf8..614eea24a0e2e 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
 
 #else
 
+void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap(
+    const framework::ir::Graph &graph,
+    std::unordered_map<std::string, std::string> *var_name_op_type_map) {
+  std::vector<framework::ir::Node *> node_list =
+      framework::ir::TopologyVarientSort(
+          graph, static_cast<framework::ir::SortKind>(0));
+  for (auto *op_node : node_list) {
+    if (!op_node->IsOp() || op_node->Op()->Type() == "feed" ||
+        op_node->Op()->Type() == "fetch")
+      continue;
+
+    for (auto *pre_node : op_node->inputs) {
+      if (pre_node->IsVar() && pre_node->Var()->Persistable()) {
+        var_name_op_type_map->insert(std::pair<std::string, std::string>(
+            pre_node->Var()->Name(), op_node->Op()->Type()));
+      }
+    }
+  }
+}
+
 void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;
@@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
+
+  bool mixed_precision_mode =
+      argument->Has("use_gpu_fp16") && argument->use_gpu_fp16();
+  std::unordered_map<std::string, std::string> var_name_op_type_map{};
+  std::unordered_set<std::string> blacklist{};
+  if (mixed_precision_mode) {
+    GetVarNameToOpTypeMap(graph, &var_name_op_type_map);
+    blacklist = argument->gpu_fp16_disabled_op_types();
+  }
+
   for (auto &var_name : all_vars) {
     if (std::count(repetitive_params.begin(), repetitive_params.end(),
                    var_name)) {
@@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
         var->IsType<framework::Tensor>()) {
       auto *t = var->GetMutable<framework::LoDTensor>();
 
-      platform::CPUPlace cpu_place;
-      framework::LoDTensor temp_tensor;
-      temp_tensor.Resize(t->dims());
-      temp_tensor.mutable_data<float>(cpu_place);
-
-      // Copy the parameter data to a tmp tensor.
-      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
-      // Reallocation the space on GPU
-      t->clear();
-
-      // Copy parameter data to newly allocated GPU space.
-      paddle::framework::TensorCopySync(temp_tensor, place, t);
+      bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 ||
+                      t->dtype() == paddle::experimental::DataType::FLOAT64;
+      if (mixed_precision_mode &&
+          !blacklist.count(var_name_op_type_map[var_name]) && is_float) {
+        framework::Tensor half_tensor;
+        half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
+        half_tensor.Resize(t->dims());
+        auto *half_data =
+            half_tensor.mutable_data<float16>(platform::CPUPlace());
+        for (int i = 0; i < t->numel(); i++) {
+          auto *data = t->mutable_data<float>(platform::CPUPlace());
+          half_data[i] = static_cast<float16>(data[i]);
+        }
+        t->clear();
+        paddle::framework::TensorCopySync(half_tensor, place, t);
+      } else {
+        platform::CPUPlace cpu_place;
+        framework::LoDTensor temp_tensor;
+        temp_tensor.Resize(t->dims());
+        paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+        t->clear();
+        paddle::framework::TensorCopySync(temp_tensor, place, t);
+      }
     }
   }
 }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index d5e98ec886e65..f8209f051d534 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 #ifdef PADDLE_WITH_ASCEND_CL
   void CopyParamsToNpu(Argument *argument);
 #else
-  void CopyParamsToGpu(Argument *argument);
+
+  void GetVarNameToOpTypeMap(
+      const framework::ir::Graph& graph,
+      std::unordered_map<std::string, std::string>* var_name_op_type_map);
+
+  void CopyParamsToGpu(Argument* argument);
 #endif
 };
 
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 41c01d3b7e261..d08d28a3f6233 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path,
 
   Update();
 }
+
 void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                   int device_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
 
   Update();
 }
+
 void AnalysisConfig::DisableGpu() {
   use_gpu_ = false;
 
   Update();
 }
 
+void AnalysisConfig::Exp_EnableUseGpuFp16(
+    std::unordered_set<std::string> op_list) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  use_gpu_fp16_ = true;
+  gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end());
+#else
+  LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()";
+  use_gpu_fp16_ = false;
+#endif
+
+  Update();
+}
+
 void AnalysisConfig::DisableFCPadding() {
   use_fc_padding_ = false;
 
@@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_cudnn_);
   CP_MEMBER(gpu_device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
+  CP_MEMBER(use_gpu_fp16_);
+  CP_MEMBER(gpu_fp16_disabled_op_types_);
 
   CP_MEMBER(enable_memory_optim_);
   // TensorRT related.
@@ -573,6 +590,20 @@ void AnalysisConfig::Update() {
 #endif
   }
 
+  if (use_gpu_fp16_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    if (!enable_ir_optim_) {
+      LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is "
+                    "enabled.";
+    } else if (!use_gpu()) {
+      LOG(ERROR)
+          << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled.";
+    } else {
+      pass_builder()->Exp_EnableUseGpuFp16();
+    }
+#endif
+  }
+
   if (use_mkldnn_) {
 #ifdef PADDLE_WITH_MKLDNN
     if (!enable_ir_optim_) {
@@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << params_file_;
 
   ss << use_gpu_;
+  ss << use_gpu_fp16_;
+  for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
   ss << use_fc_padding_;
   ss << gpu_device_id_;
   ss << xpu_device_id_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 871ed596a3ee9..a7caa3e369f80 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -50,8 +50,7 @@
 #include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/utils/string/split.h"
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
@@ -374,8 +373,7 @@ static void DisablePrepareDataOpt(
 }
 
 bool AnalysisPredictor::PrepareExecutor() {
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     VLOG(3) << "use_dist_model is enabled, will init FleetExecutor.";
     return PrepareFleetExecutor();
@@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 bool AnalysisPredictor::PrepareFleetExecutor() {
   VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()";
   if (config_.dist_config().nranks() > 1 && !CommInit()) {
@@ -872,6 +869,11 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
   }
 
+  if (config_.gpu_fp16_enabled()) {
+    argument_.SetUseGPUFp16(true);
+    argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_);
+  }
+
   if (config_.lite_engine_enabled()) {
     argument_.SetCpuMathLibraryNumThreads(
         config_.cpu_math_library_num_threads());
@@ -1189,8 +1191,7 @@ std::vector<std::string> AnalysisPredictor::GetOutputNames() {
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
     const std::string &name) {
   framework::Scope *scope;
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     scope = scope_.get();
   } else {
@@ -1239,8 +1240,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
     const std::string &name) {
   framework::Scope *scope;
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     scope = scope_.get();
   } else {
@@ -1287,8 +1287,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 }
 
 bool AnalysisPredictor::ZeroCopyRun() {
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   if (config_.dist_config().use_dist_model()) {
     VLOG(3) << "ZeroCopyRun will use the fleet executor.";
     inference::Timer timer;
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 21a7e9658bbee..d9992f3fbef9d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -18,8 +18,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
 #endif
 #include "paddle/fluid/framework/naive_executor.h"
@@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor {
   void StatisticShapeRangeInfo();
   void CollectShapeRangeInfo();
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // fleet exe related
 
   ///
@@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor {
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
   static int clone_num_;
 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
-    !defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // fleet executor related
   distributed::FleetExecutorDesc executor_desc_;
   std::shared_ptr<distributed::FleetExecutor> fleet_exe_;
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 2c6e8f4f1a4d9..ecb5eaf982548 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) {
   ASSERT_TRUE(!config.use_onnxruntime());
 }
 
+TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) {
+  AnalysisConfig config;
+  config.SwitchIrOptim();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  config.EnableUseGpu(100, 0);
+  config.Exp_EnableUseGpuFp16();
+  ASSERT_TRUE(config.gpu_fp16_enabled());
+#else
+  config.DisableGpu();
+#endif
+  LOG(INFO) << config.Summary();
+}
+
 }  // namespace paddle
 
 namespace paddle_infer {
@@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) {
   auto predictor = CreatePredictor(config);
 }
 
+TEST(Predictor, Exp_EnableUseGpuFp16) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+  config.SwitchIrOptim();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  config.EnableUseGpu(100, 0);
+  config.Exp_EnableUseGpuFp16();
+#else
+  config.DisableGpu();
+#endif
+  auto predictor = CreatePredictor(config);
+}
+
 TEST(Tensor, CpuShareExternalData) {
   Config config;
   config.SetModel(FLAGS_dirname);
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 79a31555c7f0b..2c0945cd5b386 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -53,7 +53,11 @@ if [ $7 == ON ]; then
   if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then
     echo "MobileNetV2.inference.model.tar.gz has been downloaded."
   else
-    wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    if [ $WIN_DETECT != "" ]; then
+      wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    else
+      wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz
+    fi
     tar xzf *.tar.gz
   fi
   cd ..
diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt
index 4341fb0a9ccd8..b2cfb060dd325 100644
--- a/paddle/fluid/inference/api/details/CMakeLists.txt
+++ b/paddle/fluid/inference/api/details/CMakeLists.txt
@@ -14,7 +14,11 @@
 #
 
 cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
-cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+if (WITH_ONNXRUNTIME)
+    cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime)
+else (WITH_ONNXRUNTIME)
+    cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+endif (WITH_ONNXRUNTIME)
 cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
 
 cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 18b1d09f0e8a7..66dec0157d98e 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -22,12 +22,22 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/core/allocator.h"
+#ifdef PADDLE_WITH_ONNXRUNTIME
+#include "paddle/fluid/inference/api/onnxruntime_predictor.h"
+#endif
 
 namespace paddle_infer {
 
 using float16 = paddle::platform::float16;
 
 void Tensor::Reshape(const std::vector<int> &shape) {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    shape_.assign(shape.begin(), shape.end());
+    return;
+  }
+#endif
+
   PADDLE_ENFORCE_EQ(
       name_.empty(), false,
       paddle::platform::errors::PreconditionNotMet(
@@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const {
 }
 
 DataType Tensor::type() const {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    return dtype_;
+  }
+#endif
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   auto type = paddle::framework::TransToProtoVarType(tensor->dtype());
   if (type == paddle::framework::proto::VarType::FP32) {
@@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; }
 
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    ORTCopyFromCpu<T>(data);
+    return;
+  }
+#endif
+
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_GE(tensor->numel(), 0,
                     paddle::platform::errors::PreconditionNotMet(
@@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
 
 template <typename T>
 void Tensor::CopyToCpu(T *data) const {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    ORTCopyToCpu<T>(data);
+    return;
+  }
+#endif
+
   CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
 }
 
@@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
 template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
 template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
 
-Tensor::Tensor(void *scope) : scope_{scope} {
-  PADDLE_ENFORCE_NOT_NULL(scope_,
-                          paddle::platform::errors::PreconditionNotMet(
-                              "The `scope` can not be nullptr. It should be "
-                              "set to the pointer of scope."));
-}
+Tensor::Tensor(void *scope) : scope_{scope} {}
 
 template <typename T>
 void *Tensor::FindTensor() const {
@@ -513,6 +537,26 @@ void *Tensor::FindTensor() const {
 }
 
 std::vector<int> Tensor::shape() const {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    std::vector<int> shape;
+    // input handle
+    if (idx_ < 0) {
+      shape.assign(shape_.begin(), shape_.end());
+    } else {  // output handle
+      auto binding = binding_.lock();
+      PADDLE_ENFORCE_NOT_NULL(binding,
+                              paddle::platform::errors::PreconditionNotMet(
+                                  "output tensor [%s] no binding ptr", name_));
+      std::vector<Ort::Value> outputs = binding->GetOutputValues();
+      Ort::Value &value = outputs[idx_];
+      auto info = value.GetTensorTypeAndShapeInfo();
+      auto ort_shape = info.GetShape();
+      shape.assign(ort_shape.begin(), ort_shape.end());
+    }
+    return shape;
+  }
+#endif
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_NOT_NULL(
       tensor_, paddle::platform::errors::PreconditionNotMet(
@@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) {
   device_ = device;
 }
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; }
+
+void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
+  binding_ = binding;
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<float>(memory_info, data, size, shape,
+                                         shape_len);
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int64_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int32_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<uint8_t>(memory_info, data, size, shape,
+                                           shape_len);
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor<int8_t>(memory_info, data, size, shape,
+                                          shape_len);
+}
+
+Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data,
+                       size_t size, const int64_t *shape, size_t shape_len) {
+  return Ort::Value::CreateTensor(memory_info, static_cast<void *>(data),
+                                  size * sizeof(float16), shape, shape_len,
+                                  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
+}
+
+template <typename T>
+void Tensor::ORTCopyFromCpu(const T *data) {
+  auto binding = binding_.lock();
+  PADDLE_ENFORCE_NOT_NULL(binding,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "input tensor [%s] no binding ptr", name_));
+  const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
+  Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_,
+                              OrtMemTypeDefault);
+  size_t size = std::accumulate(begin(shape_), end(shape_), 1UL,
+                                std::multiplies<size_t>());
+  auto ort_value = GetOrtVaule(memory_info, const_cast<T *>(data), size,
+                               shape_.data(), shape_.size());
+  binding->BindInput(name_.c_str(), ort_value);
+}
+
+template <typename T>
+void Tensor::ORTCopyToCpu(T *data) const {
+  auto binding = binding_.lock();
+  PADDLE_ENFORCE_NOT_NULL(binding,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "output tensor [%s] no binding ptr", name_));
+  std::vector<Ort::Value> outputs = binding->GetOutputValues();
+  Ort::Value &value = outputs[idx_];
+  auto info = value.GetTensorTypeAndShapeInfo();
+  size_t size = info.GetElementCount() * sizeof(T);
+
+  if (place_ == PlaceType::kCPU) {
+    std::memcpy(static_cast<void *>(data), value.GetTensorData<void *>(), size);
+  } else {
+    paddle::memory::Copy(paddle::platform::CPUPlace(),
+                         static_cast<void *>(data),
+                         paddle::platform::CUDAPlace(device_),
+                         value.GetTensorData<void>(), size, nullptr);
+  }
+}
+
+template void Tensor::ORTCopyFromCpu<float>(const float *data);
+template void Tensor::ORTCopyFromCpu<int64_t>(const int64_t *data);
+template void Tensor::ORTCopyFromCpu<int32_t>(const int32_t *data);
+template void Tensor::ORTCopyFromCpu<uint8_t>(const uint8_t *data);
+template void Tensor::ORTCopyFromCpu<int8_t>(const int8_t *data);
+template void Tensor::ORTCopyFromCpu<float16>(const float16 *data);
+
+template void Tensor::ORTCopyToCpu<float>(float *data) const;
+template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
+template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
+template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
+template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
+#endif
+
 }  // namespace paddle_infer
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc
index ee82da139d8f3..bd9de252a0962 100644
--- a/paddle/fluid/inference/api/onnxruntime_predictor.cc
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc
@@ -25,11 +25,7 @@
 #include <vector>
 
 #include "paddle/fluid//platform/device/gpu/gpu_types.h"
-#include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/var_type_traits.h"
-#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
@@ -45,24 +41,23 @@
 
 namespace paddle {
 
-framework::proto::VarType::Type ConvertONNXType(
-    ONNXTensorElementDataType type) {
+paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) {
   switch (type) {
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
-      return framework::proto::VarType::FP32;
-    // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
-    //   return DataType::FP16;
+      return paddle_infer::DataType::FLOAT32;
+    case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
+      return paddle_infer::DataType::FLOAT16;
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8:
-      return framework::proto::VarType::INT8;
+      return paddle_infer::DataType::INT8;
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32:
-      return framework::proto::VarType::INT32;
+      return paddle_infer::DataType::INT32;
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64:
-      return framework::proto::VarType::INT64;
+      return paddle_infer::DataType::INT64;
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8:
-      return framework::proto::VarType::UINT8;
+      return paddle_infer::DataType::UINT8;
     default:
       LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast<int>(type);
-      return framework::proto::VarType::FP32;
+      return paddle_infer::DataType::FLOAT32;
   }
 }
 
@@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() {
   VLOG(3) << "ONNXRuntime Predictor::init()";
 
   // Now ONNXRuntime only suuport CPU
+  const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
   if (config_.use_gpu()) {
     place_ = paddle::platform::CUDAPlace(config_.gpu_device_id());
   } else {
     place_ = paddle::platform::CPUPlace();
   }
-  scope_.reset(new paddle::framework::Scope());
-  sub_scope_ = &scope_->NewScope();
 
   std::string onnx_proto;
   paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto,
@@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() {
                "generated.";
   }
   session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options};
+  binding_ = std::make_shared<Ort::IoBinding>(session_);
 
-  auto memory_info =
-      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+  Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
+                              place_.GetDeviceId(), OrtMemTypeDefault);
   Ort::Allocator allocator(session_, memory_info);
 
-  framework::proto::VarType::Type proto_type =
-      framework::proto::VarType::LOD_TENSOR;
   size_t n_inputs = session_.GetInputCount();
   for (size_t i = 0; i < n_inputs; ++i) {
     auto input_name = session_.GetInputName(i, allocator);
@@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() {
     ONNXTensorElementDataType data_type =
         type_info.GetTensorTypeAndShapeInfo().GetElementType();
     input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type});
-    auto *ptr = scope_->Var(input_name);
-    framework::InitializeVariable(ptr, proto_type);
     allocator.Free(input_name);
   }
 
@@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() {
     ONNXTensorElementDataType data_type =
         type_info.GetTensorTypeAndShapeInfo().GetElementType();
     output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type});
-    auto *ptr = scope_->Var(output_name);
-    framework::InitializeVariable(ptr, proto_type);
+
+    Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator,
+                                    place_.GetDeviceId(), OrtMemTypeDefault);
+    binding_->BindOutput(output_name, out_memory_info);
+
     allocator.Free(output_name);
   }
-
   return true;
 }
 
@@ -216,15 +209,26 @@ std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() {
   return output_names;
 }
 
+bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
+                                        bool is_input) {
+  if (is_input) {
+    for (auto i : input_desc_)
+      if (i.name == name) return true;
+  } else {
+    for (auto i : output_desc_)
+      if (i.name == name) return true;
+  }
+  return false;
+}
+
 std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
-                          platform::errors::PreconditionNotMet(
-                              "The in variable named %s is not found in the "
-                              "scope of the ONNXPredictor.",
-                              name));
-  std::unique_ptr<ZeroCopyTensor> res(
-      new ZeroCopyTensor(static_cast<void *>(scope_.get())));
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true,
+                    platform::errors::PreconditionNotMet(
+                        "The in variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
   res->input_or_output_ = true;
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
@@ -233,18 +237,19 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
     auto gpu_place = place_;
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
   }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
   return res;
 }
 
 std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
-                          platform::errors::PreconditionNotMet(
-                              "The out variable named %s is not found in the "
-                              "scope of the ONNXPredictor.",
-                              name));
-  std::unique_ptr<ZeroCopyTensor> res(
-      new ZeroCopyTensor(static_cast<void *>(scope_.get())));
+  PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true,
+                    platform::errors::PreconditionNotMet(
+                        "The out variable named %s is not found in the "
+                        "ONNXPredictor.",
+                        name));
+  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr));
   res->input_or_output_ = false;
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
@@ -253,46 +258,18 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
     auto gpu_place = place_;
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
   }
+  res->SetOrtMark(true);
+  res->SetOrtBinding(binding_);
+  int size = output_desc_.size();
+  for (int i = 0; i < size; ++i)
+    if (output_desc_[i].name == name) {
+      res->idx_ = i;
+      res->dtype_ = ConvertONNXType(output_desc_[i].dtype);
+      break;
+    }
   return res;
 }
 
-Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
-                                             const char *device_name) {
-  Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
-                              place_.GetDeviceId(), OrtMemTypeDefault);
-  auto *var = scope_->FindVar(desc.name);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  size_t size =
-      tensor->numel() *
-      framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
-  std::vector<int64_t> shape = phi::vectorize<int64_t>(tensor->dims());
-  return Ort::Value::CreateTensor(memory_info,
-                                  static_cast<void *>(tensor->data()), size,
-                                  shape.data(), shape.size(), desc.dtype);
-}
-
-void ONNXRuntimePredictor::AsTensor(const Ort::Value &value,
-                                    const ONNXDesc &desc) {
-  auto info = value.GetTensorTypeAndShapeInfo();
-
-  auto *var = scope_->FindVar(desc.name);
-  auto *tensor = var->GetMutable<framework::LoDTensor>();
-  tensor->Resize(phi::make_ddim(info.GetShape()));
-  auto dtype = ConvertONNXType(info.GetElementType());
-  auto *ptr = tensor->mutable_data(place_, dtype);
-
-  if (platform::is_cpu_place(place_)) {
-    std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()),
-                tensor->numel() * framework::SizeOfType(dtype));
-  } else {
-    auto src_place = place_;
-    auto dst_place = place_;
-    memory::Copy(dst_place, ptr, src_place,
-                 const_cast<void *>(value.GetTensorData<void>()),
-                 tensor->numel() * framework::SizeOfType(dtype));
-  }
-}
-
 bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
@@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
 
 bool ONNXRuntimePredictor::ZeroCopyRun() {
   try {
-    Ort::IoBinding binding(session_);
-    std::vector<Ort::Value> inputs;
-    std::vector<Ort::Value> outputs;
-    Ort::RunOptions options;
-
-    inputs.reserve(input_desc_.size());
-    const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu";
-    for (auto desc : input_desc_) {
-      inputs.push_back(GetOrtValue(desc, device_name));
-      binding.BindInput(desc.name.c_str(), inputs.back());
-    }
-
-    // TODO(heliqi): Optimization —— move to  Init()
-    for (auto desc : output_desc_) {
-      Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator,
-                                  place_.GetDeviceId(), OrtMemTypeDefault);
-      binding.BindOutput(desc.name.c_str(), memory_info);
-    }
-
-    session_.Run({}, binding);
-
-    outputs = binding.GetOutputValues();
-    for (size_t i = 0; i < output_desc_.size(); ++i) {
-      AsTensor(outputs[i], output_desc_[i]);
-    }
+    session_.Run({}, *(binding_.get()));
   } catch (const std::exception &e) {
     LOG(ERROR) << e.what();
     return false;
@@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
 }
 
 ONNXRuntimePredictor::~ONNXRuntimePredictor() {
-  if (sub_scope_) {
-    scope_->DeleteScope(sub_scope_);
-  }
+  binding_->ClearBoundInputs();
+  binding_->ClearBoundOutputs();
+
   memory::Release(place_);
 }
 
diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h
index 7fb07aa97bd27..d01756e4b96b1 100644
--- a/paddle/fluid/inference/api/onnxruntime_predictor.h
+++ b/paddle/fluid/inference/api/onnxruntime_predictor.h
@@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   /// \param[in] AnalysisConfig config
   ///
   explicit ONNXRuntimePredictor(const AnalysisConfig &config)
-      : config_(config) {
+      : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") {
     predictor_id_ = inference::GetUniqueId();
-    env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx");
   }
   ///
   /// \brief Destroy the ONNXRuntime Predictor object
@@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  std::shared_ptr<framework::Scope> scope_;
-
  private:
   ///
-  /// \brief get the Ort Value(input Tensor).
-  ///
-  /// \param[in] desc ONNXDesce(name、shape、dtype)
-  ///
-  /// \param[in] device_name "cpu" or "gpu" of device
-  ///
-  /// \return get a Ort::Value
-  ///
-  Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
-
-  ///
-  /// \brief Ort::Value to Paddle::ZeroCopyTensor.
+  /// \brief Whether to find in/out by name.
   ///
-  /// \param[in] value Ort::Value(output Tensor)
+  /// \param[in] name input or output name
   ///
-  /// \param[in] desc a ONNXDesce(name、shape、dtype)
+  /// \param[in] is_input input(true) or output(false)
   ///
-  /// \return get a Ort::Value
+  /// \return Whether to find by name
   ///
-  void AsTensor(const Ort::Value &value, const ONNXDesc &desc);
+  bool FindONNXDesc(const std::string &name, bool is_input);
 
  private:
   AnalysisConfig config_;
@@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   // ONNXRuntime
   Ort::Env env_;
   Ort::Session session_{nullptr};
+  std::shared_ptr<Ort::IoBinding> binding_;
 
   platform::Place place_;
-  framework::Scope *sub_scope_{nullptr};
   std::vector<ONNXDesc> input_desc_;
   std::vector<ONNXDesc> output_desc_;
   int predictor_id_;
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 7b765e3fa8a24..bdfe0e46e9ca4 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   ///
   void DisableGpu();
+  ///
+  /// \brief Enable GPU fp16 precision computation, in experimental state.
+  ///
+  /// \param op_list The operator type list.
+  ///
+  void Exp_EnableUseGpuFp16(std::unordered_set<std::string> op_list = {});
+  ///
+  /// \brief A boolean state telling whether the GPU fp16 precision is turned
+  /// on.
+  ///
+  /// \return bool Whether the GPU fp16 precision is turned on.
+  ///
+  bool gpu_fp16_enabled() const { return use_gpu_fp16_; }
 
   ///
   /// \brief Turn on XPU.
@@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig {
   int gpu_device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
   bool thread_local_stream_{false};
+  bool use_gpu_fp16_{false};
+  std::unordered_set<std::string> gpu_fp16_disabled_op_types_{
+      "conv2d_fusion", "conv2d", "roll", "strided_slice"};
 
   bool use_cudnn_{false};
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 22d9dedb32ebf..95975d8f2a892 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() {
   use_cudnn_ = true;
 }
 
+void GpuPassStrategy::Exp_EnableUseGpuFp16() {
+  passes_.assign({
+    "is_test_pass",                               //
+        "simplify_with_basic_ops_pass",           //
+        "conv_bn_fuse_pass",                      //
+        "conv_eltwiseadd_bn_fuse_pass",           //
+        "embedding_eltwise_layernorm_fuse_pass",  //
+        "multihead_matmul_fuse_pass_v2",          //
+        "gpu_cpu_squeeze2_matmul_fuse_pass",      //
+        "gpu_cpu_reshape2_matmul_fuse_pass",      //
+        "gpu_cpu_flatten2_matmul_fuse_pass",      //
+        "gpu_cpu_map_matmul_v2_to_mul_pass",      //
+        "gpu_cpu_map_matmul_v2_to_matmul_pass",   //
+        "gpu_cpu_map_matmul_to_mul_pass",         //
+        // "fc_fuse_pass",                        //
+        "fc_elementwise_layernorm_fuse_pass",  //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+// cudnn8.0 has memory leak problem in conv + eltwise + act, so we
+// disable the pass.
+#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100)
+        "conv_elementwise_add_act_fuse_pass",   //
+        "conv_elementwise_add2_act_fuse_pass",  //
+#endif
+        "conv_elementwise_add_fuse_pass",      //
+#endif                                         //
+        "transpose_flatten_concat_fuse_pass",  //
+        "mixed_precision_configure_pass",      //
+        "runtime_context_cache_pass"           //
+  });
+
+  use_gpu_fp16_ = true;
+}
+
 void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 351cf71e5ca74..02290ed33ff1c 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \brief Enable the use of cuDNN kernel.
   virtual void EnableCUDNN() {}
 
+  /// \brief Enable use gpu fp16 kernel.
+  virtual void Exp_EnableUseGpuFp16() {}
+
   /// \brief Enable the use of MKLDNN.
   /// The MKLDNN control exists in both CPU and GPU mode, because there can
   /// still be some CPU kernels running in GPU mode.
@@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \return A bool variable implying whether we are in gpu mode.
   bool use_gpu() const { return use_gpu_; }
 
+  /// \brief Check if we are using gpu fp16 kernel.
+  /// \return A bool variable implying whether we are in gpu fp16 mode.
+  bool use_gpu_fp16() const { return use_gpu_fp16_; }
+
   /// \brief Check if we are using xpu.
   /// \return A bool variable implying whether we are in xpu mode.
   bool use_xpu() const { return use_xpu_; }
@@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   bool use_npu_{false};
   bool use_ipu_{false};
   bool use_mkldnn_{false};
+  bool use_gpu_fp16_{false};
   /// \endcond
 };
 
@@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
   /// \brief Enable the use of cuDNN kernel.
   void EnableCUDNN() override;
 
+  /// \brief Enable the use of gpu fp16 kernel.
+  void Exp_EnableUseGpuFp16() override;
+
   /// \brief Not supported in GPU mode yet.
   void EnableMKLDNN() override;
 
@@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
  protected:
   /// \cond Protected
   bool use_cudnn_{false};
+  bool use_gpu_fp16_{false};
   /// \endcond
 };
 
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 5a98d109aed79..2afe2d32e2f60 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -18,6 +18,11 @@
 
 #include "paddle_infer_declare.h"  // NOLINT
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+#include "onnxruntime_c_api.h"    // NOLINT
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#endif
+
 namespace paddle_infer {
 
 /// \brief  Experimental.
@@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor {
   PlaceType place_;
   int device_;
 
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  bool is_ort_tensor_{false};
+  std::vector<int64_t> shape_;
+  std::weak_ptr<Ort::IoBinding> binding_;
+  int idx_{-1};
+
+  void SetOrtMark(bool is_ort_tensor);
+
+  void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
+
+  template <typename T>
+  void ORTCopyFromCpu(const T* data);
+
+  template <typename T>
+  void ORTCopyToCpu(T* data) const;
+#endif
+
   friend class paddle_infer::contrib::TensorUtils;
 #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST)
   friend class paddle_infer::InferApiTesterUtils;
diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index 67e7c78b62e9d..496e8932a690d 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/layer_norm_op.h"
+
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 7f7313fbcb596..1ad82df41737c 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
 }  // namespace paddle
 
 USE_OP_ITSELF(relu);
-USE_OP(sigmoid);
-USE_OP(tanh);
+USE_OP_ITSELF(sigmoid);
+USE_OP_ITSELF(tanh);
 USE_OP(relu6);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
index b96992ef8514a..a856d14144469 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
 USE_OP_ITSELF(conv2d);
-USE_OP(conv2d_transpose);
+USE_OP_ITSELF(conv2d_transpose);
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
index 1725888abc379..f17e00de0eeb7 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
@@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) {
 }  // namespace paddle
 
 // USE_OP(leaky_relu);
-USE_OP(leaky_relu);
+USE_OP_ITSELF(leaky_relu);
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index 861e98e443756..67d44184a76d0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -17,7 +17,7 @@
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h"
-#include "paddle/fluid/operators/layer_norm_op.h"
+#include "paddle/phi/kernels/layer_norm_kernel.h"
 
 namespace paddle {
 namespace inference {
@@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
   cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
                   cudaMemcpyHostToDevice, stream);
 
-  paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
+  phi::LayerNormDirectCUDAFunctor<float> layer_norm;
   layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
              variance_d, begin_norm_axis, eps);
   return cudaGetLastError() != cudaSuccess;
@@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue(
     cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
                     cudaMemcpyHostToDevice, stream);
 
-    paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
+    phi::LayerNormDirectCUDAFunctor<float> layer_norm;
     layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
                variance_d, begin_norm_axis, eps);
   } else {
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 61e292a922f0e..abf7256475336 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -34,6 +34,7 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -210,15 +211,36 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         allow_free_idle_chunk_ = allow_free_idle_chunk;
-        if (!FLAGS_use_stream_safe_cuda_allocator) {
-          for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-               ++dev_id) {
-            InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
-                                        allow_free_idle_chunk_);
-          }
+        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
+          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+                                      allow_free_idle_chunk_);
+        }
+
+        // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place
+        // -> Allocator) hold the StreamSafeCUDAAllocator releate to default
+        // stream (i.e., the stream directly got from DeviceContex), while the
+        // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the
+        // StreamSafeCUDAAllocator releate to non-default stream (i.e., the
+        // stream users pass in). The default stream Allocator is built in the
+        // structure of AllocatorFacadePrivate, while the non-default stream is
+        // build in a delayed manner in GetAllocator function with
+        // 'create_if_not_found = ture'. We make special treatment for the
+        // default stream for performance reasons. Since most Alloc calls are
+        // for default stream in application, treating it separately can avoid
+        // lots of overhead of acquiring default stream and applying read-write
+        // lock.
+        if (FLAGS_use_stream_safe_cuda_allocator) {
+          WrapStreamSafeCUDAAllocatorForDefault();
         }
+
         InitNaiveBestFitCUDAPinnedAllocator();
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
+        }
+        InitNaiveBestFitNPUPinnedAllocator();
+#endif
 #ifdef PADDLE_WITH_XPU
         for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
           InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
@@ -295,7 +317,8 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    if (FLAGS_use_stream_safe_cuda_allocator == false &&
+        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -335,7 +358,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    {  // shared_lock_guard
+    if (stream == GetDefaultStream(place)) {
+      VLOG(7) << "Get Allocator by passing in a default stream";
+      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    }
+
+    /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       if (LIKELY(HasCUDAAllocator(place, stream))) {
@@ -349,7 +377,7 @@ class AllocatorFacadePrivate {
       }
     }
 
-    {  // unique_lock_guard
+    /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       InitStreamSafeCUDAAllocator(place, stream);
@@ -357,9 +385,40 @@ class AllocatorFacadePrivate {
     }
   }
 
-  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
+  const std::shared_ptr<StreamSafeCUDAAllocator>
+  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
+    const auto iter = default_stream_safe_cuda_allocators_.find(place);
+    PADDLE_ENFORCE_NE(
+        iter, default_stream_safe_cuda_allocators_.end(),
+        platform::errors::NotFound(
+            "No StreamSafeCUDAAllocator found for the place, %s", place));
+    return iter->second;
+  }
+
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    return allocator->GetDefaultStream();
+  }
+
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream) {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    allocator->SetDefaultStream(stream);
+    VLOG(8) << "Set default stream to " << stream
+            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
+            << place;
+  }
+
+  void SetDefaultStreamFromDeviceContext() {
+    VLOG(8) << "Set default stream from DeviceContex";
+    for (auto& pair : default_stream_safe_cuda_allocators_) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      pair.second->SetDefaultStream(
+          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
+    }
   }
 
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -629,6 +688,26 @@ class AllocatorFacadePrivate {
         /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
 
+  void WrapStreamSafeCUDAAllocatorForDefault() {
+    for (auto& pair : allocators_) {
+      auto& place = pair.first;
+      if (platform::is_gpu_place(place)) {
+        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
+            std::make_shared<StreamSafeCUDAAllocator>(
+                pair.second, place, /* default_stream = */ nullptr,
+                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
+        pair.second = allocator;
+
+        // NOTE(Ruibiao): A tricky implement to give StreamSafeCUDAAllocator an
+        // ability to interact with the outside world, i.e., change default
+        // stream from outside
+        default_stream_safe_cuda_allocators_[place] = allocator;
+        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
+                << ", allocator address = " << pair.second.get();
+      }
+    }
+  }
+
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                               size_t retry_time) {
     PADDLE_ENFORCE_GT(
@@ -807,7 +886,6 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  // NOTE(Ruibiao): Old single-stream version, will be removed later
   void WrapCUDARetryAllocator(size_t retry_time) {
     PADDLE_ENFORCE_GT(
         retry_time, 0,
@@ -822,6 +900,8 @@ class AllocatorFacadePrivate {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
+  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
+      default_stream_safe_cuda_allocators_;
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
 #endif
@@ -864,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    AllocatorFacadePrivate* m = GetPrivate();
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
-  }
-#endif
-
   return GetPrivate()->GetAllocator(
       place, /* A non-zero num to choose allocator_ */ 1);
 }
@@ -892,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
   return GetPrivate()->GetBasePtr(allocation);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
-  }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
-}
-#endif
-
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
   return GetPrivate()->GetAllocator(place, /* zero size */ 0);
@@ -917,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      size > 0 && FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
-        GetPrivate()->GetDefaultStream(cuda_place)));
-    return Alloc(cuda_place, size, default_stream);
-  }
-#endif
   return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()
       ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
@@ -1022,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
   GetPrivate()->RecordStream(allocation, stream);
 }
 
+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place, const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
+  }
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 const gpuStream_t& AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
   PADDLE_ENFORCE_EQ(
@@ -1034,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
   return GetPrivate()->GetStream(allocation);
 }
 
+void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
+                                       const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    GetPrivate()->SetDefaultStream(place, stream);
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
@@ -1049,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  allocator->SetDefaultStreamFromDeviceContext();
+
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 9066bb284e28a..1ea872f7ecaf4 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -55,11 +55,6 @@ class AllocatorFacade {
 
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
-                                                 const gpuStream_t& stream);
-#endif
-
   const std::shared_ptr<Allocator>& GetZeroAllocator(
       const platform::Place& place);
 
@@ -86,8 +81,12 @@ class AllocatorFacade {
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
+                                                 const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream);
 #endif
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 072c4dee3bc45..7e47d35176bac 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
 
+const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
+  return default_stream_;
+}
+
+void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
+  default_stream_ = stream;
+}
+
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Free",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                          platform::errors::InvalidArgument(
-                              "Failed to dynamic cast %p from Allocation* to "
-                              "StreamSafeCUDAAllocation*",
-                              allocation));
+      static_cast<StreamSafeCUDAAllocation*>(allocation);
+
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 
 void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  // NOTE(Ruibiao): This condition is to reduce lock competion. It does not need
+  // to be thread-safe since here occasional misjudgments are permissible.
+  if (unfreed_allocations_.empty()) {
+    return;
+  }
+
   std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   for (auto it = unfreed_allocations_.begin();
        it != unfreed_allocations_.end();) {
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index ecddff97c206b..65af32c701b75 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
                           platform::CUDAPlace place, gpuStream_t default_stream,
                           bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
+
   bool IsAllocThreadSafe() const override;
+  const gpuStream_t &GetDefaultStream() const;
+  void SetDefaultStream(const gpuStream_t &stream);
 
  protected:
   phi::Allocation *AllocateImpl(size_t size) override;
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 66f1bcc8b6869..8f7b62a2c9d27 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1482,6 +1482,23 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor);
 REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor);
 REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor);
 REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor);
+REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor);
+REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu,
+                       ThresholdedReluFunctor, ThresholdedReluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor,
+                       HardShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
+                       SoftShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
+                       TanhShrinkGradFunctor);
+REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                       HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
+                       LogSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
+REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
+REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
 
 /* ==========================    sigmoid register  =============================
  */
@@ -1516,30 +1533,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad,
                       ops::SigmoidTripleGradFunctor<float>::FwdDeps()>,
                   ops::ActivationTripleGradOpInplaceInferer);
 
-// Register Sigmoid/GradSigmoid Kernels
-REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
-                               SigmoidGradFunctor);
-
-// Register DoubleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
-
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_triple_grad,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<float>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<double>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>);
-
 /* ========================================================================== */
 
 /* ==========================    tanh register  ============================= */
@@ -1567,23 +1560,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpTripleGrad<ops::TanhTripleGradFunctor<float>::FwdDeps()>,
     ops::ActivationTripleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    tanh_grad_grad, ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                                              ops::TanhGradGradFunctor<float>>,
-    ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::TanhGradGradFunctor<double>>,
-    ops::TanhDoubleGradKernel<plat::CPUDeviceContext,
-                              ops::TanhGradGradFunctor<plat::float16>>);
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    tanh_triple_grad,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<float>>,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<double>>,
-    ops::TanhTripeGradKernel<plat::CPUDeviceContext,
-                             ops::TanhTripleGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================    relu register  ============================= */
@@ -1623,16 +1599,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad2<ops::LeakyReluGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor,
-                               LeakyReluGradFunctor);
-REGISTER_OP_CPU_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ========================    elu  register     ============================ */
@@ -1650,22 +1616,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad<ops::ELUGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_OP_CPU_KERNEL(elu,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ELUFunctor<float>>,
-                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
-                                             ops::ELUFunctor<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elu_grad, ops::ELUGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ELUGradKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                                            ops::ELUGradGradFunctor<float>>,
-    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::ELUGradGradFunctor<double>>,
-    ops::ELUDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::ELUGradGradFunctor<plat::float16>>);
-
 /* ========================================================================== */
 
 /* ========================    logit  register     ============================
@@ -1920,15 +1870,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                                            ops::LogGradGradFunctor<float>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<double>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================  register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 4b79397b6cdf2..7db5675c16b2d 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -238,21 +238,20 @@ struct BaseActivationFunctor {
   AttrPair GetAttrs() { return AttrPair(); }
 };
 
-// sigmoid(x) = 1 / (1 + exp(-x))
-template <typename T>
-struct SigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-  }
-};
-
 #define USE_PHI_FUNCTOR(name)                         \
   template <typename T>                               \
   using name##Functor = phi::funcs::name##Functor<T>; \
   template <typename T>                               \
   using name##GradFunctor = phi::funcs::name##GradFunctor<T>;
 
+#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \
+  template <typename T>                   \
+  using name##GradGradFunctor = phi::funcs::name##GradGradFunctor<T>;
+
+#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \
+  template <typename T>                   \
+  using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor<T>;
+
 USE_PHI_FUNCTOR(Cos)
 USE_PHI_FUNCTOR(Tan)
 USE_PHI_FUNCTOR(Acos)
@@ -264,181 +263,32 @@ USE_PHI_FUNCTOR(Cosh)
 USE_PHI_FUNCTOR(Asinh)
 USE_PHI_FUNCTOR(Acosh)
 USE_PHI_FUNCTOR(Atanh)
-
-template <typename T>
-struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out * (static_cast<T>(1) - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut -> SigmoidGradGrad -> DOutNew
-    DDX                        DDOut
-
-    DDOut = (1-Out)*Out*DDX
-    DOutNew = (1-2*Out)*DOut*DDX
-*/
-template <typename T>
-struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
-
-    if (dOutNew) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
-      auto dout_new = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
-      dout_new.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
-      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut                            D_Dout
-    DDx     -> SigmoidTripleGrad -> D_DDx
-    D_DDout                         d_OutNew
-    D_Dout_new
-
-    D_Dout = (1-2*Out)*DDx*D_Dout_new
-    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
-    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
-
-    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-    D_OutNew, D_DOut, D_DDx               // output
-*/
-template <typename T>
-struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  const framework::Tensor* d_DDOut,
-                  const framework::Tensor* d_dOut_New,
-                  framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
-                  framework::Tensor* d_DDx) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
-    auto dout = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
-    auto d_ddOut = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
-    auto d_dOutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
-        d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
-
-    if (d_Out_New) {
-      auto d_OutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
-          d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
-      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
-                            static_cast<T>(2) * dout * ddx * d_dOutNew;
-    }
-    if (d_d_Out) {
-      auto d_dOut = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
-      d_dOut.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
-    }
-    if (d_DDx) {
-      auto d_ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
-      d_ddx.device(*d) =
-          (static_cast<T>(1) - out) * out * d_ddOut +
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// silu(x) = x / (1 + exp(-x))
-template <typename T>
-struct SiluFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-    out.device(d) = x * temp;
-  }
-};
-
-// silu'(x) = (1 / (1 + e^{-x}))  * (1 + out * e^{-x}))
-template <typename T>
-struct SiluGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = static_cast<T>(1) + (-x).exp();  // 1+e^(-x)
-    auto temp2 = x * (-x).exp();                  // x*e^(-x)
-    dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
-                           (static_cast<T>(1) + (temp2 / temp1)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// Originally: logsigmoid(x) = -log (1 + exp(-x))
-// For numerical stability, we can use the log-sum-exp trick:
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// We can rewrite the above equation as:
-// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
-//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
-//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
-//           max(-x, 0)))
-//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-//   = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
-//
-// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
-// + exp(-x - max(-x, 0))))
-template <typename T>
-struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
-  }
-};
-
-// Originally: f' = exp(-x) / (1 + exp(-x))
-// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
-// exp(-x - max(-x, 0)))
-template <typename T>
-struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    dx.device(d) =
-        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
+USE_PHI_FUNCTOR(Tanh)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh)
+USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh)
+USE_PHI_FUNCTOR(BRelu)
+USE_PHI_FUNCTOR(ThresholdedRelu)
+USE_PHI_FUNCTOR(LeakyRelu)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu)
+USE_PHI_FUNCTOR(HardShrink)
+USE_PHI_FUNCTOR(SoftShrink)
+USE_PHI_FUNCTOR(TanhShrink)
+USE_PHI_FUNCTOR(Silu)
+USE_PHI_FUNCTOR(ELU)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
+USE_PHI_FUNCTOR(Sigmoid)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_FUNCTOR(LogSigmoid)
+USE_PHI_FUNCTOR(HardSigmoid)
+USE_PHI_FUNCTOR(Log)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Log)
+USE_PHI_FUNCTOR(Log2)
+USE_PHI_FUNCTOR(Log10)
+USE_PHI_FUNCTOR(Log1p)
+
+template <typename T>
+using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
 
 // exp(x) = e^x
 template <typename T>
@@ -497,210 +347,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor<T>;
 template <typename T>
 using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor<T>;
 
-// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct TanhFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.tanh();
-  }
-};
-
-template <typename T>
-struct TanhGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) - out * out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
-    // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
-    // * ddx)
-    if (dOutNew) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
-      auto dout_new = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
-      dout_new.device(*d) =
-          static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
-      ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-/*
-    Out
-    DOut                            D_Dout
-    DDx     -> TanhTripleGrad ->    D_DDx
-    D_DDout                         d_OutNew
-    D_Dout_new
-
-    D_Dout = (-2) * Out * DDx * D_Dout_new
-    D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
-    D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
-
-    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-    D_OutNew, D_DOut, D_DDx               // output
-*/
-template <typename T>
-struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  const framework::Tensor* d_DDOut,
-                  const framework::Tensor* d_dOut_New,
-                  framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
-                  framework::Tensor* d_DDx) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
-    auto dout = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
-    auto d_ddOut = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
-    auto d_dOutNew = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
-
-    if (d_Out_New) {
-      auto d_OutNew = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
-      d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
-                            (static_cast<T>(2) * dout * ddx * d_dOutNew);
-    }
-    if (d_d_Out) {
-      auto d_dOut = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
-      d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
-    }
-    if (d_DDx) {
-      auto d_ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
-      d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
-                         static_cast<T>(2) * out * dout * d_dOutNew;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x - x.tanh();
-  }
-};
-
-template <typename T>
-struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x.tanh() * x.tanh());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// tanhshrink(x) = x - tanh(x)
-// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-template <typename T>
-struct HardShrinkFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp1 = x < static_cast<T>(threshold * -1.f);
-    auto temp2 = x > static_cast<T>(threshold);
-    out.device(d) = x * (temp1 || temp2).template cast<T>();
-  }
-};
-
-template <typename T>
-struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 = x < static_cast<T>(threshold * -1.f);
-    auto temp2 = x > static_cast<T>(threshold);
-    dx.device(d) = dout * (temp1 || temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
-// otherwise
-template <typename T>
-struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
-  }
-};
-
-template <typename T>
-struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto lambdaT = static_cast<T>(lambda);
-    auto temp1 = (x > lambdaT).template cast<T>();
-    auto temp2 = (x < -lambdaT).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // sqrt(x) = x^(1/2)
 template <typename T>
 struct SqrtFunctor : public BaseActivationFunctor<T> {
@@ -807,88 +453,6 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-// log(x) = natural logarithm of x
-template <typename T>
-struct LogFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log();
-  }
-};
-
-template <typename T>
-struct LogGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) / x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log2(x) = logarithm to the base 2 of the elements of x
-template <typename T>
-struct Log2Functor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log() / static_cast<T>(log(2));
-  }
-};
-
-// the gradient of log2(x) is 1/(x*ln(2))
-template <typename T>
-struct Log2GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log10(x) = logarithm to the base 10 of the elements of x
-template <typename T>
-struct Log10Functor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log() / static_cast<T>(log(10));
-  }
-};
-
-// the gradient of log10(x) is 1/(x*ln(10))
-template <typename T>
-struct Log10GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log1p(x) = natural logarithm of x+1
-template <typename T>
-struct Log1pFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = (static_cast<T>(1) + x).log();
-  }
-};
-
-template <typename T>
-struct Log1pGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // square(x) = x^2
 template <typename T>
 struct SquareFunctor : public BaseActivationFunctor<T> {
@@ -909,42 +473,6 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct BReluFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-
-  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
-  // not polymorphism for speed.
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
-  }
-};
-
-template <typename T>
-struct BReluGradFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
-                       .template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // relu6(x) = min(max(0, x), 6)
 template <typename T>
 struct Relu6Functor : public BaseActivationFunctor<T> {
@@ -1168,94 +696,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct LeakyReluFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    if (alpha < 1.f) {
-      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
-    } else {
-      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
-    }
-  }
-};
-
-template <typename T>
-struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp1 =
-        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
-    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
-    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) =
-        (x < static_cast<T>(0))
-            .select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
-  }
-};
-
-template <typename T>
-struct ELUGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    // case 1: alpha >= 0
-    // dx = dout, if out > 0
-    // dx = dout * (out + alpha), if out <= 0
-    dx.device(d) = (out > static_cast<T>(0))
-                       .select(dout, dout * (out + static_cast<T>(alpha)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    // case 2: alpha < 0
-    // dx = dout, if x > 0
-    // dx = dout * (out + alpha), if x <=0
-    dx.device(d) = (x > static_cast<T>(0))
-                       .select(dout, dout * static_cast<T>(alpha) * x.exp());
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename DeviceContext, typename T>
 class ELUGradKernel : public framework::OpKernel<T> {
  public:
@@ -1423,79 +863,11 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto a = static_cast<T>(scale_a);
     auto b = static_cast<T>(scale_b);
-    auto temp = (a * x).tanh() * (a * x).tanh();
-    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto th = static_cast<T>(threshold);
-    out.device(d) = (x > th).template cast<T>() * x;
-  }
-};
-
-template <typename T>
-struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto th = static_cast<T>(threshold);
-    dx.device(d) = dout * (x > th).template cast<T>();
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    out.device(d) =
-        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
-  }
-};
-
-template <typename T>
-struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
-                       .template cast<T>() *
-                   static_cast<T>(slope);
+    auto temp = (a * x).tanh() * (a * x).tanh();
+    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
   }
 
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
@@ -1531,121 +903,6 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-/*
- * in arguments: x, out, ddx
- * out arguments: ddout, dout, dx
- */
-template <ActBwdOpFwdDeps kDepValue>
-inline void ExtractActivationDoubleGradTensor(
-    const framework::ExecutionContext& ctx, const framework::Tensor** X,
-    const framework::Tensor** Out, const framework::Tensor** ddX,
-    framework::Tensor** dX, framework::Tensor** dOut,
-    framework::Tensor** ddOut) {
-  auto ddx_var = ctx.InputVar("DDX");
-  auto ddo_var = ctx.OutputVar("DDOut");
-  PADDLE_ENFORCE_NOT_NULL(
-      ddx_var, platform::errors::NotFound(
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.InputName("DDX")));
-  if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-    *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var);
-    if (ddo_var) {
-      *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-          ddo_var);
-    }
-  } else {
-    *ddX = ctx.Input<framework::Tensor>("DDX");
-    if (ddo_var) {
-      *ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-  }
-  PADDLE_ENFORCE_NOT_NULL(
-      *ddX,
-      platform::errors::NotFound(
-          "Cannot get the tensor from the Variable Output, variable name = %s",
-          ctx.OutputName("DDX")));
-
-  if (static_cast<int>(kDepValue) & static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
-    auto x_var = ctx.InputVar("X");
-    PADDLE_ENFORCE_NOT_NULL(
-        x_var, platform::errors::NotFound(
-                   "Cannot get input Variable Out, variable name = %s",
-                   ctx.InputName("X")));
-    auto dx_var = ctx.OutputVar("DX");
-    if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-      *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var);
-      if (dx_var) {
-        *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-            dx_var);
-      }
-    } else {
-      *X = ctx.Input<framework::Tensor>("X");
-      if (dx_var) {
-        *dX = ctx.Output<framework::Tensor>("DX");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.Type();
-    *X = *ddX;
-  }
-  if (static_cast<int>(kDepValue) &
-      static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var,
-        platform::errors::NotFound(
-            "Cannot get the tensor from the Variable Out, variable name = %s",
-            ctx.InputName("Out")));
-    auto dout_var = ctx.OutputVar("DOut");
-    if (CanBeUsedBySelectedRows.count(ctx.Type())) {
-      *Out =
-          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
-      if (dout_var) {
-        *dOut =
-            paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
-                dout_var);
-      }
-    } else {
-      *Out = ctx.Input<framework::Tensor>("Out");
-      if (dout_var) {
-        *dOut = ctx.Output<framework::Tensor>("DOut");
-      }
-    }
-  } else {
-    VLOG(10) << "Inplace activation of Op: " << ctx.Type();
-    *Out = *ddX;
-  }
-}
-
-template <typename DeviceContext, typename Functor>
-class ActivationDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *Out, *ddX;
-    X = Out = ddX = nullptr;
-    framework::Tensor *ddOut, *dOut, *dX;
-    ddOut = dOut = dX = nullptr;
-
-    ExtractActivationDoubleGradTensor<Functor::FwdDeps()>(ctx, &X, &Out, &ddX,
-                                                          &dX, &dOut, &ddOut);
-
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-    if (dOut) dOut->mutable_data<T>(ctx.GetPlace());
-    if (dX) dX->mutable_data<T>(Out->dims(), ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
-    functor(place, X, Out, ddX, ddOut, dOut, dX);
-  }
-};
-
 template <typename T>
 struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device>
@@ -1667,73 +924,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* Out, const framework::Tensor* ddX,
-                  framework::Tensor* ddOut, framework::Tensor* dOut,
-                  framework::Tensor* dX) const {
-    if (ddOut) {
-      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
-      auto x = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
-      ddout.device(*d) =
-          ddx *
-          ((x > static_cast<T>(0)).template cast<T>() +
-           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
-              .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
-  float alpha;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
-
-    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
-      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
-                      (x <= static_cast<T>(0)).template cast<T>();
-    }
-
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
-      ddout.device(*d) = ddx *
-                         ((x > static_cast<T>(0)).template cast<T>() +
-                          static_cast<T>(alpha) * x.exp() *
-                              (x <= static_cast<T>(0)).template cast<T>())
-                             .template cast<T>();
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CELUGradGradFunctor : public BaseActivationFunctor<T> {
   float alpha;
@@ -1907,211 +1097,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }
 
-template <typename DeviceContext, typename Functor>
-class SigmoidDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    // set output ddout
-    ddOut = ctx.Output<framework::Tensor>("DDOut");
-    // extract dOut(intput)
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-// Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-// D_OutNew, D_DOut, D_DDx               // output
-template <typename DeviceContext, typename Functor>
-class SigmoidTripleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew、d_dOut、d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-
-    // extract ddx(input) and out(input)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddx_var, platform::errors::NotFound(
-                     "Cannot get input Variable ddx, variable name = %s",
-                     ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var, platform::errors::NotFound(
-                     "Cannot get input Variable out, variable name = %s",
-                     ctx.InputName("Out")));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-
-    // set output ddout
-    auto ddout_var = ctx.OutputVar("DDOut");
-    if (ddout_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-
-    // extract dOut(intput)
-    auto dout_var = ctx.InputVar("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dout_var, platform::errors::NotFound(
-                      "Cannot get input Variable dout_var, variable name = %s",
-                      ctx.InputName("DOut")));
-    dOut = ctx.Input<framework::Tensor>("DOut");
-
-    // set output dout_new
-    auto dout_new_var = ctx.OutputVar("DOutNew");
-    if (dout_new_var) {
-      dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    }
-
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhTripeGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-
-    // set output d_OutNew、d_dOut、d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2135,37 +1120,6 @@ class SquareDoubleGradKernel
   }
 };
 
-template <typename DeviceContext, typename Functor>
-class LogDoubleGradKernel
-    : public SquareDoubleGradKernel<DeviceContext, Functor> {};
-
-template <typename DeviceContext, typename Functor>
-class ELUDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *ddX, *dOut;
-    X = ddX = dOut = nullptr;
-    framework::Tensor *dX, *ddOut;
-    dX = ddOut = nullptr;
-
-    ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
-
-    if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
-    functor(place, X, ddX, ddOut, dOut, dX);
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class CELUDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2460,62 +1414,19 @@ class LogitGradKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename T>
-struct LogGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
-    // ddout = ddx / x; dx = -(dout / x) * (ddx / x)
-    // calculate dx first, so ddout can inplace ddx
-    if (dX) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
-      dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
-      ddout.device(*d) = ddx * static_cast<T>(1) / x;
-    }
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_ACTIVATION_OP(__macro)                                       \
-  __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                          \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
-  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
-  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
-  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
-  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
-  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
-  __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                      \
-  __macro(log2, Log2, Log2Functor, Log2GradFunctor);                          \
-  __macro(log10, Log10, Log10Functor, Log10GradFunctor);                      \
-  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                      \
-  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
-  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                      \
-  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
-  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
-  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
-  __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
-  __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
-  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
-          HardSigmoidGradFunctor);                                            \
-  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
-  __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor,          \
-          ThresholdedReluGradFunctor);                                        \
-  __macro(mish, Mish, MishFunctor, MishGradFunctor);                          \
+#define FOR_EACH_ACTIVATION_OP(__macro)                                      \
+  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                         \
+  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
+  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
+  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
+  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
+  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
+  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
+  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
+  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
+  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                     \
+  __macro(mish, Mish, MishFunctor, MishGradFunctor);                         \
   __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 92a101451e211..bb08cee5bcde9 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -15,170 +15,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 
+#include "paddle/phi/kernels/funcs/activation_functor.h"
+
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // leakyrelu(x) = x > 0 ? x : alpha * x
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > zero ? x : static_cast<T>(alpha) * x;
-  }
-};
-
-template <typename T>
-struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // dx = dout * (x > 0 ? 1 : alpha)
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return x > zero ? dout : static_cast<T>(alpha) * dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // sigmoid(x) = 1 / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(one / (one + exp(-x)));
-  }
-};
-
-template <typename T>
-struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout * out * (1 - out)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out * (one - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaSiluFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // silu(x) = x / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x / (one + exp(-x)));
-  }
-};
-
-template <typename T>
-struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp = one / (one + exp(-x));
-    return static_cast<T>(dout * (temp * (one + x * (one - temp))));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-
-  // logsigmoid(x) = log(1 / (1 + exp(-x)))
-  // For numerical stability,
-  // logsigmoid(x) =
-  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp = x > zero ? zero : -x;
-    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
-  }
-};
-
-template <typename T>
-struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-
-  // dx = dout * exp(-x) / (1 + exp(-x))
-  // For numerical stability:
-  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
-  // 0)))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp1 = x > zero ? zero : -x;
-    MPType temp2 = exp(-x - temp1);
-    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
-  float lambda;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  // softshrink(x) = x - lambda, if x > lambda;
-  //                 x + lambda, if x < -lambda;
-  //                 0, otherwise.
-  __device__ __forceinline__ T operator()(const T x) const {
-    T l = static_cast<T>(lambda);
-    T temp1 = static_cast<T>(x > l);
-    T temp2 = static_cast<T>(x < -l);
-    return temp1 * (x - l) + temp2 * (x + l);
-  }
-};
-
-template <typename T>
-struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float lambda;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"lambda", &lambda}};
-  }
-
-  // dx = dout, if x > lambda or x < -lambda else 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T l = static_cast<T>(lambda);
-    return (x >= -l && x <= l) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -224,31 +65,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaTanhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanh(x) = tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout * (1 - out^2)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * (one - out * out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaReciprocalFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
@@ -315,27 +131,6 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaLogFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log(x) = log(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log(x));
-  }
-};
-
-template <typename T>
-struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout / x
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / x;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
   // square(x) = x * x
@@ -404,117 +199,6 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // log1p(x) = log(1 + x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log(one + x));
-  }
-};
-
-template <typename T>
-struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout / (1 + x)
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (one + x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaLog2Functor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log2(x) = log2(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log2(x));
-  }
-};
-
-template <typename T>
-struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
-
-  // dx = dout / (x * log(2))
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (x * log_two);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaLog10Functor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log10(x) = log10(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log10(x));
-  }
-};
-
-template <typename T>
-struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
-
-  // dx = dout / (x * log(10))
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (x * log_ten);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaBReluFunctor : public BaseActivationFunctor<T> {
-  float t_min;
-  float t_max;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  // brelu(x) = min(max(x, t_min), t_max)
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    T temp_max = x > t_min_cast ? x : t_min_cast;
-    T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float t_min;
-  float t_max;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"t_min", &t_min}, {"t_max", &t_max}};
-  }
-
-  // dx = (x > t_min && x < t_max) ? dout : 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t_min_cast = static_cast<T>(t_min);
-    T t_max_cast = static_cast<T>(t_max);
-    return (x > t_min_cast && x < t_max_cast) ? dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -711,109 +395,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // tanhshrink(x) = x - tanh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x - tanh(x));
-  }
-};
-
-template <typename T>
-struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // dx = dout * tanh(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * tanh(x) * tanh(x));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
-  __device__ __forceinline__ T operator()(const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : x;
-  }
-};
-
-template <typename T>
-struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = (x > -threshold && x < threshold) ? 0 : dout
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    T t = static_cast<T>(threshold);
-    return (x > -t && x < t) ? zero : dout;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // hard_sigmoid(x) = 0, when x <= -3
-  //                   1, when x >= 3
-  //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T x) const {
-    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    T temp_max = temp > zero ? temp : zero;
-    T temp_min = temp_max < one ? temp_max : one;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaSwishFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -907,38 +488,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // thresholded_relu(x) = x > threshold ? x : 0
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > static_cast<T>(threshold) ? x : zero;
-  }
-};
-
-template <typename T>
-struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  float threshold;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
-
-  // dx = x > threshold ? dout : 0
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return x > static_cast<T>(threshold) ? dout : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaHardSwishFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
@@ -991,110 +540,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
-template <typename T>
-struct CudaELUFunctor : public BaseActivationFunctor<T> {
-  using CT = typename details::MPTypeTrait<T>::Type;
-  CT zero = static_cast<CT>(0.0f);
-  CT one = static_cast<CT>(1.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // elu(x) = x, if x > 0
-  // elu(x) = alpha * (e^x - 1), if x <= 0
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    CT x = static_cast<CT>(arg_x);
-    CT temp = static_cast<CT>(alpha) * (exp(x) - one);
-    CT res = x > zero ? x : temp;
-    return static_cast<T>(res);
-  }
-};
-
-template <typename T>
-struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // case 1: alpha >= 0
-  // dx = dout, if out > 0
-  // dx = dout * (out + alpha), if out <= 0
-  __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType out = static_cast<MPType>(arg_out);
-    MPType a = static_cast<MPType>(alpha);
-    MPType out_pos = static_cast<MPType>(out > zero);
-    MPType out_neg = static_cast<MPType>(out <= zero);
-    return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-  float alpha;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"alpha", &alpha}};
-  }
-
-  // case 2: alpha < 0
-  // dx = dout, if x > 0
-  // dx = dout * (out + alpha), if x <=0
-  __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType out = static_cast<MPType>(arg_out);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType a = static_cast<MPType>(alpha);
-    MPType x_pos = static_cast<MPType>(x > zero);
-    MPType x_neg = static_cast<MPType>(x <= zero);
-    return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename DeviceContext, typename T>
-class ELUGradCudaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* out = ctx.Input<framework::Tensor>("Out");
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_x->mutable_data<T>(ctx.GetPlace());
-    const float alpha = ctx.Attr<float>("alpha");
-
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-    std::vector<const framework::Tensor*> ins = {d_out, out};
-    std::vector<framework::Tensor*> outs = {d_x};
-    if (alpha > 0) {
-      CudaELUGradFunctor<T> functor;
-      functor.alpha = alpha;
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                                &outs, functor);
-    } else {
-      CudaELUGradNegativeAlphaFunctor<T> functor;
-      functor.alpha = alpha;
-      ins.push_back(x);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                                &outs, functor);
-    }
-  }
-};
-
 template <typename T>
 struct CudaCELUFunctor : public BaseActivationFunctor<T> {
   using CT = typename details::MPTypeTrait<T>::Type;
@@ -1212,6 +657,38 @@ class ActivationGradCudaKernel
   }
 };
 
+USE_PHI_FUNCTOR(CudaCos)
+USE_PHI_FUNCTOR(CudaTan)
+USE_PHI_FUNCTOR(CudaAcos)
+USE_PHI_FUNCTOR(CudaSin)
+USE_PHI_FUNCTOR(CudaAsin)
+USE_PHI_FUNCTOR(CudaAtan)
+USE_PHI_FUNCTOR(CudaSinh)
+USE_PHI_FUNCTOR(CudaCosh)
+USE_PHI_FUNCTOR(CudaAsinh)
+USE_PHI_FUNCTOR(CudaAcosh)
+USE_PHI_FUNCTOR(CudaAtanh)
+USE_PHI_FUNCTOR(CudaTanh)
+USE_PHI_FUNCTOR(CudaBRelu)
+USE_PHI_FUNCTOR(CudaLeakyRelu)
+USE_PHI_FUNCTOR(CudaThresholdedRelu)
+USE_PHI_FUNCTOR(CudaHardShrink)
+USE_PHI_FUNCTOR(CudaSoftShrink)
+USE_PHI_FUNCTOR(CudaTanhShrink)
+USE_PHI_FUNCTOR(CudaSilu)
+USE_PHI_FUNCTOR(CudaELU)
+USE_PHI_FUNCTOR(CudaSigmoid)
+USE_PHI_FUNCTOR(CudaLogSigmoid)
+USE_PHI_FUNCTOR(CudaHardSigmoid)
+USE_PHI_FUNCTOR(CudaLog)
+USE_PHI_FUNCTOR(CudaLog2)
+USE_PHI_FUNCTOR(CudaLog10)
+USE_PHI_FUNCTOR(CudaLog1p)
+
+template <typename T>
+using CudaELUGradNegativeAlphaFunctor =
+    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -1270,40 +747,6 @@ namespace plat = paddle::platform;
       ops::ActivationGradCudaKernel<plat::CUDADeviceContext,                   \
                                     ops::grad_functor<plat::bfloat16>>);
 
-/* ======================== leaky relu register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
-                                CudaLeakyReluGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    leaky_relu_grad_grad,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<float>>,
-    ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
-                                    ops::LeakyReluGradGradFunctor<double>>,
-    ops::ActivationDoubleGradKernel<
-        plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
-/* ======================== elu register  ============================ */
-REGISTER_OP_CUDA_KERNEL(
-    elu, ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                                   ops::CudaELUFunctor<float>>,
-    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
-                              ops::CudaELUFunctor<double>>,
-    ops::ActivationCudaKernel<plat::CUDADeviceContext,
-                              ops::CudaELUFunctor<plat::float16>>);
-REGISTER_OP_CUDA_KERNEL(
-    elu_grad, ops::ELUGradCudaKernel<plat::CUDADeviceContext, float>,
-    ops::ELUGradCudaKernel<plat::CUDADeviceContext, double>,
-    ops::ELUGradCudaKernel<plat::CUDADeviceContext, plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(
-    elu_grad_grad, ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                                            ops::ELUGradGradFunctor<float>>,
-    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::ELUGradGradFunctor<double>>,
-    ops::ELUDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::ELUGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ======================== celu register  ============================ */
@@ -1319,58 +762,6 @@ REGISTER_OP_CUDA_KERNEL(
                               ops::CELUGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ===========================    sigmoid register  ============================
- */
-REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
-                                CudaSigmoidGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>,
-    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::bfloat16>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_triple_grad,
-    ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<float>>,
-    ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<double>>,
-    ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>,
-    ops::SigmoidTripleGradKernel<
-        plat::CUDADeviceContext,
-        ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
-/* ========================================================================== */
-
-/* ===========================    tanh register  ============================ */
-REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
-                                CudaTanhGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    tanh_grad_grad,
-    ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<float>>,
-    ops::TanhDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<double>>,
-    ops::TanhDoubleGradKernel<plat::CUDADeviceContext,
-                              ops::TanhGradGradFunctor<plat::float16>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    tanh_triple_grad,
-    ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<float>>,
-    ops::TanhTripeGradKernel<paddle::platform::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<double>>,
-    ops::TanhTripeGradKernel<plat::CUDADeviceContext,
-                             ops::TanhTripleGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
 /* ===========================   sqrt register  ============================= */
 REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
                                 CudaSqrtGradFunctor);
@@ -1495,22 +886,7 @@ REGISTER_OP_CUDA_KERNEL(
                                   ops::CudaExpm1GradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ==========================  Log register ==================================*/
-REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                                            ops::LogGradGradFunctor<float>>,
-    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::LogGradGradFunctor<double>>,
-    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::LogGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
-  __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);                  \
-  __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
-          CudaLogSigmoidGradFunctor);                                         \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
           CudaSoftShrinkGradFunctor);                                         \
   __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor);                  \
@@ -1518,10 +894,6 @@ REGISTER_OP_CUDA_KERNEL(
   __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor);               \
   __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
           CudaReciprocalGradFunctor);                                         \
-  __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor);              \
-  __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor);                  \
-  __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor);              \
-  __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor);              \
   __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
   __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
   __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
@@ -1531,74 +903,228 @@ REGISTER_OP_CUDA_KERNEL(
           CudaTanhShrinkGradFunctor);                                         \
   __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor,                     \
           CudaHardShrinkGradFunctor);                                         \
-  __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor,                  \
-          CudaHardSigmoidGradFunctor);                                        \
   __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor);              \
   __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);                  \
-  __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor,      \
-          CudaThresholdedReluGradFunctor);                                    \
   __macro(hard_swish, HardSwish, CudaHardSwishFunctor,                        \
           CudaHardSwishGradFunctor);
 FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
 
 #ifdef PADDLE_WITH_XPU_KP
-#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor,             \
-                                       grad_functor)                           \
-  REGISTER_OP_KERNEL(                                                          \
-      act_type, KP, plat::XPUPlace,                                            \
-      ops::ActivationCudaKernel<plat::XPUDeviceContext, ops::functor<float>>); \
-  REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace,                      \
-                     ops::ActivationGradCudaKernel<plat::XPUDeviceContext,     \
-                                                   ops::grad_functor<float>>);
-
-REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor,
-                               CudaLeakyReluGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
-                               CudaSigmoidGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor,
-                               CudaReciprocalGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor,
-                               CudaSoftplusGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor,
-                               CudaHardSwishGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor,
-                               CudaCELUGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
-                               CudaSqrtGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor,
-                               CudaSquareGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor,
-                               CudaSiluGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,
-                               CudaLogSigmoidGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor,
-                               CudaSoftShrinkGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor,
-                               CudaZeroGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor,
-                               CudaZeroGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor,
-                               CudaLog1pGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor,
-                               CudaBReluGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor,
-                               CudaSoftReluGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor,
-                               CudaSoftsignGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor,
-                               CudaRelu6GradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor,
-                               CudaHardShrinkGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid,
-                               CudaHardSigmoidFunctor,
-                               CudaHardSigmoidGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor,
-                               CudaSwishGradFunctor);
-REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu,
-                               CudaThresholdedReluFunctor,
-                               CudaThresholdedReluGradFunctor);
+REGISTER_OP_KERNEL(
+    brelu, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              phi::funcs::CudaBReluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    brelu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  phi::funcs::CudaBReluGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaCeilFunctor<float>>);
+REGISTER_OP_KERNEL(
+    ceil_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaZeroGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaCELUFunctor<float>>);
+REGISTER_OP_KERNEL(
+    celu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaCELUGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaELUFunctor<float>>);
+REGISTER_OP_KERNEL(
+    elu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaELUGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaExpFunctor<float>>);
+REGISTER_OP_KERNEL(
+    exp_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaExpGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaFloorFunctor<float>>);
+REGISTER_OP_KERNEL(
+    floor_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaZeroGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    hard_shrink, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaHardShrinkFunctor<float>>);
+REGISTER_OP_KERNEL(
+    hard_shrink_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaHardShrinkGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    hard_sigmoid, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaHardSigmoidFunctor<float>>);
+REGISTER_OP_KERNEL(
+    hard_sigmoid_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaHardSigmoidGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaHardSwishFunctor<float>>);
+REGISTER_OP_KERNEL(
+    hard_swish_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaHardSwishGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    leaky_relu, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              phi::funcs::CudaLeakyReluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    leaky_relu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  phi::funcs::CudaLeakyReluGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(log, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaLogFunctor<float>>);
+REGISTER_OP_KERNEL(
+    log_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaLogGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaLog1pFunctor<float>>);
+REGISTER_OP_KERNEL(
+    log1p_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaLog1pGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    logsigmoid, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaLogSigmoidFunctor<float>>);
+REGISTER_OP_KERNEL(
+    logsigmoid_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaLogSigmoidGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    reciprocal, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaReciprocalFunctor<float>>);
+REGISTER_OP_KERNEL(
+    reciprocal_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaReciprocalGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    relu, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              phi::funcs::CudaReluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    relu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  phi::funcs::CudaReluGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaRelu6Functor<float>>);
+REGISTER_OP_KERNEL(
+    relu6_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaRelu6GradFunctor<float>>);
+
+REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSigmoidFunctor<float>>);
+REGISTER_OP_KERNEL(
+    sigmoid_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSigmoidGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSiluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    silu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSiluGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSoftReluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    soft_relu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSoftReluGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSoftplusFunctor<float>>);
+REGISTER_OP_KERNEL(
+    softplus_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSoftplusGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    softshrink, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaSoftShrinkFunctor<float>>);
+REGISTER_OP_KERNEL(
+    softshrink_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSoftShrinkGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSoftsignFunctor<float>>);
+REGISTER_OP_KERNEL(
+    softsign_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSoftsignGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSqrtFunctor<float>>);
+REGISTER_OP_KERNEL(
+    sqrt_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSqrtGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(square, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSquareFunctor<float>>);
+REGISTER_OP_KERNEL(
+    square_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSquareGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace,
+                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                                             ops::CudaSwishFunctor<float>>);
+REGISTER_OP_KERNEL(
+    swish_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaSwishGradFunctor<float>>);
+
+REGISTER_OP_KERNEL(
+    thresholded_relu, KP, plat::XPUPlace,
+    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
+                              ops::CudaThresholdedReluFunctor<float>>);
+REGISTER_OP_KERNEL(
+    thresholded_relu_grad, KP, plat::XPUPlace,
+    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
+                                  ops::CudaThresholdedReluGradFunctor<float>>);
 
 #endif  // PADDLE_WITH_XPU_KP
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
new file mode 100644
index 0000000000000..237cfcc6f1172
--- /dev/null
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    const auto xs = ctx.MultiInput<framework::Tensor>("X");
+    const auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
+
+    found_inf->mutable_data<bool>(dev_ctx.GetPlace());
+
+    MLUCnnlTensorDesc scale_desc(*scale);
+    MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY,
+                                     ToCnnlDataType<bool>());
+
+    for (size_t i = 0; i < xs.size(); ++i) {
+      const auto* x = xs[i];
+      auto* out = outs[i];
+      out->mutable_data<T>(ctx.GetPlace());
+
+      // check is_finite or is_nan
+      Tensor is_finite(found_inf->type());
+      if (i != 0) {
+        is_finite.Resize(phi::make_ddim({1}));
+        is_finite.mutable_data<bool>(ctx.GetPlace());
+      } else {
+        is_finite.ShareDataWith(*found_inf);
+      }
+
+      MLUCnnlTensorDesc x_desc(*x);
+
+      MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x),
+                        GetBasePtr(&is_finite));
+
+      // save is_finite by logical_and op after checking every input
+      if (i != 0) {
+        MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY,
+                                         ToCnnlDataType<bool>());
+        MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(),
+                       GetBasePtr(found_inf), is_finite_desc.get(),
+                       GetBasePtr(&is_finite), found_inf_desc.get(),
+                       GetBasePtr(found_inf));
+      }
+
+      // The normal logic is :
+      // out = in, if found_inf = true
+      // out = in/scale, if found_inf = false
+      // But when found_inf is true, the data of Out should not be used.
+      // So, on MLU, we always compute out with in/scale.
+      MLUCnnlTensorDesc out_desc(*out);
+      MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
+                   GetBasePtr(x), scale_desc.get(), GetBasePtr(scale),
+                   out_desc.get(), GetBasePtr(out));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
+                       ops::CheckFiniteAndUnscaleMLUKernel<float>,
+                       ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index 684ac5bafd0ef..ea6614cbfbdf8 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 
 #include <string>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel {
            const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    if (ctx->HasInput("X")) {
-      auto type = ctx->GetInputsVarType("X")[0];
-      if (type == framework::proto::VarType::SELECTED_ROWS ||
-          type == framework::proto::VarType::LOD_TENSOR) {
-        ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-        if (type == framework::proto::VarType::LOD_TENSOR) {
-          ctx->ShareLoD("X", /*->*/ "Out");
-        }
-      } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-        if (ctx->IsRuntime()) {
-          // The runtime output shape is determined in kernel.
-          return;
-        } else {
-          ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-        }
-      }
-    }
-  }
-
  protected:
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
@@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference {
   }
 };
 
-class AssignKernel {
- public:
-  void operator()(const framework::ExecutionContext &ctx) const {
-    auto *x = ctx.InputVar("X");
-    if (x == nullptr) {
-      return;
-    }
-    PADDLE_ENFORCE_EQ(
-        ctx.HasOutput("Out"), true,
-        platform::errors::NotFound("Output(Out) of assign_op is not found."));
-    auto *out = ctx.OutputVar("Out");
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(ctx.GetPlace());
-
-    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
-  }
-};
-
 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
+DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(assign, ops::AssignOp,
                   ops::AssignGradMaker<paddle::framework::OpDesc>,
                   ops::AssignGradMaker<paddle::imperative::OpBase>,
                   ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer,
-                  ops::AssignInferVarType);
-
-REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
-                               ops::AssignKernel, int, ops::AssignKernel,
-                               int64_t, ops::AssignKernel, uint8_t,
-                               ops::AssignKernel, bool, ops::AssignKernel,
-                               plat::float16, ops::AssignKernel, plat::bfloat16,
-                               ops::AssignKernel);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
-                                ops::AssignKernel, int, ops::AssignKernel,
-                                int64_t, ops::AssignKernel, uint8_t,
-                                ops::AssignKernel, bool, ops::AssignKernel,
-                                plat::float16, ops::AssignKernel);
-#endif
+                  ops::AssignInferVarType, AssignInferShapeFunctor);
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
index b452dea8536dd..b91eb50646fec 100644
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ b/paddle/fluid/operators/assign_op_npu_test.cc
@@ -29,7 +29,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(assign);
+USE_OP_ITSELF(assign);
 USE_OP_DEVICE_KERNEL(assign, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 174207deb08b8..5194c8772e47b 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -297,184 +300,6 @@ The required data format for this layer is one of the following:
 )DOC");
 }
 
-template <typename T>
-class BatchNormKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    float momentum = ctx.Attr<float>("momentum");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
-    bool test_mode = is_test && (!trainable_stats);
-
-    bool global_stats = test_mode || use_global_stats;
-
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-
-    const auto *x = ctx.Input<Tensor>("X");
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE_GE(
-        x_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The size of input X's dimensions should be larger than 1."
-            "But received: the size of input X's dimensions is [%d]",
-            x_dims.size()));
-    PADDLE_ENFORCE_LE(
-        x_dims.size(), 5,
-        platform::errors::InvalidArgument(
-            "The size of input X's dimensions should be less than 6."
-            "But received: the size of input X's dimensionss is [%d]",
-            x_dims.size()));
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int sample_size = x->numel() / N / C;
-
-    auto *y = ctx.Output<Tensor>("Y");
-
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-
-    // alloc memory
-    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
-    saved_mean->mutable_data<T>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
-
-    // input dimension is 2 and the format is NCHW. The input can be regarded
-    // as NHWC format
-    if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
-      data_layout = DataLayout::kNHWC;
-    }
-
-    if (!global_stats) {
-      // saved_xx is use just in this batch of data
-      EigenVectorArrayMap<T> saved_mean_e(
-          saved_mean->mutable_data<T>(ctx.GetPlace()), C);
-      EigenVectorArrayMap<T> saved_variance_e(
-          saved_variance->mutable_data<T>(ctx.GetPlace()), C);
-      saved_mean_e.setZero();
-      saved_variance_e.setZero();
-
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), C);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), C);
-
-      if ((N * sample_size) == 1) {
-        // Only 1 element in normalization dimension,
-        // we skip the batch norm calculation, let y = x.
-        framework::TensorCopy(*x, ctx.GetPlace(), y);
-        return;
-      }
-
-      switch (data_layout) {
-        case DataLayout::kNCHW: {
-          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-          for (int nc = 0; nc < N * C; ++nc) {
-            saved_mean_e(nc % C) += x_arr.col(nc).sum();
-          }
-          saved_mean_e /= N * sample_size;
-          for (int nc = 0; nc < N * C; ++nc) {
-            saved_variance_e(nc % C) +=
-                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
-          }
-          saved_variance_e /= N * sample_size;
-          break;
-        }
-        case DataLayout::kNHWC: {
-          ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
-          for (int i = 0; i < N * sample_size; ++i) {
-            saved_mean_e += x_arr.col(i);
-          }
-          saved_mean_e /= N * sample_size;
-          for (int i = 0; i < N * sample_size; ++i) {
-            saved_variance_e +=
-                (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
-          }
-          saved_variance_e /= N * sample_size;
-          break;
-        }
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Unknown storage order: %s", data_layout_str));
-      }
-
-      // if MomentumTensor is set, use MomentumTensor value, momentum
-      // is only used in this training branch
-      if (ctx.HasInput("MomentumTensor")) {
-        const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
-        momentum = mom_tensor->data<float>()[0];
-      }
-
-      running_mean_arr =
-          running_mean_arr * momentum + saved_mean_e * (1. - momentum);
-      running_var_arr =
-          running_var_arr * momentum + saved_variance_e * (1. - momentum);
-    }
-
-    // use SavedMean and SavedVariance to do normalize
-    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
-    if (global_stats) {
-      ConstEigenVectorArrayMap<T> var_arr(
-          ctx.Input<Tensor>("Variance")->data<T>(), C);
-      inv_std = (var_arr + epsilon).sqrt().inverse();
-    } else {
-      EigenVectorArrayMap<T> saved_inv_std(
-          ctx.Output<Tensor>("SavedVariance")->data<T>(), C);
-      // inverse SavedVariance first, gradient will use it too.
-      saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
-      inv_std = saved_inv_std;
-    }
-    ConstEigenVectorArrayMap<T> mean_arr(
-        global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
-                     : ctx.Output<Tensor>("SavedMean")->data<T>(),
-        C);
-
-    //   ((x - est_mean) * (inv_var) * scale + bias
-    //   formula transform ====>
-    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> bias_arr(bias->data<T>(), C);
-    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
-    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
-        bias_arr - mean_arr * inv_std * scale_arr;
-
-    switch (data_layout) {
-      case DataLayout::kNCHW: {
-        EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
-                               N * C);
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-        for (int nc = 0; nc < N * C; ++nc) {
-          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
-        }
-        break;
-      }
-      case DataLayout::kNHWC: {
-        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
-                         N * sample_size) =
-            (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
-             new_scale)
-                .colwise() +
-            new_bias;
-        break;
-      }
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Unknown storage order: %d", data_layout));
-    }
-  }
-};
-
 void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
   // check input
   OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad");
@@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar(
                                  tensor.place(), tensor.layout());
 }
 
-template <typename T>
-class BatchNormGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto *scale = ctx.Input<Tensor>("Scale");
-    const auto *bias = ctx.Input<Tensor>("Bias");
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    // SavedVariance have been reverted in forward operator
-    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool is_test = ctx.Attr<bool>("is_test");
-    const float epsilon = ctx.Attr<float>("epsilon");
-    DataLayout data_layout = framework::StringToDataLayout(data_layout_str);
-
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    use_global_stats = is_test || use_global_stats;
-
-    // batch_norm with inplace as false will take X as grad input, which
-    // is same as cuDNN batch_norm backward calculation, batch_norm
-    // with inplace as true only take Y as input and X should be calculate
-    // by inverse operation of batch_norm on Y
-    const Tensor *x;
-    bool is_inplace;
-    if (ctx.HasInput("Y")) {
-      x = ctx.Input<Tensor>("Y");
-      is_inplace = true;
-      // if the input of batch norm is stop_gradient, d_x is null.
-      if (d_x) {
-        PADDLE_ENFORCE_EQ(d_x, d_y,
-                          platform::errors::InvalidArgument(
-                              "X@GRAD and Y@GRAD not inplace in inplace mode"));
-      }
-    } else {
-      x = ctx.Input<Tensor>("X");
-      is_inplace = false;
-      if (d_x) {
-        PADDLE_ENFORCE_NE(
-            d_x, d_y, platform::errors::InvalidArgument(
-                          "X@GRAD and Y@GRAD inplaced in non-inplace mode"));
-      }
-    }
-
-    // Get the size for each dimension.
-    // NCHW [batch_size, in_channels, in_height, in_width]
-    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE_GE(
-        x_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The size of input X's dimensions should be larger than 1."
-            "But received: the size of input X's dimensions is [%d]",
-            x_dims.size()));
-    PADDLE_ENFORCE_LE(
-        x_dims.size(), 5,
-        platform::errors::InvalidArgument(
-            "The size of input X's dimensions should be less than 6."
-            "But received: the size of input X's dimensions is [%d]",
-            x_dims.size()));
-    const int N = x_dims[0];
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int sample_size = x->numel() / N / C;
-
-    // input dimension is 2 and the format is NCHW. The input can be regarded as
-    // NHWC format
-    if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) {
-      data_layout = DataLayout::kNHWC;
-    }
-
-    // init output
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-    }
-
-    const T *mean_data = saved_mean->data<T>();
-    const T *inv_var_data = saved_inv_variance->data<T>();
-    Tensor inv_var_tensor;
-    if (use_global_stats) {
-      const auto *running_mean = ctx.Input<Tensor>("Mean");
-      const auto *running_variance = ctx.Input<Tensor>("Variance");
-      mean_data = running_mean->data<T>();
-      inv_var_tensor.Resize({C});
-      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
-      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
-      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
-
-      inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
-      inv_var_data = running_inv_var_data;
-    }
-
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> bias_arr(bias->data<T>(), C);
-    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
-    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
-
-    T *d_bias_data = nullptr;
-    T *d_scale_data = nullptr;
-    if (d_scale && d_bias) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
-      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
-    }
-
-    // d_bias = np.sum(d_y, axis=0)
-    // d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
-    // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
-    //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
-    EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
-    EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
-
-    if (d_scale && d_bias) {
-      d_bias_arr.setZero();
-      d_scale_arr.setZero();
-    }
-
-    if (d_x && (N * sample_size) == 1 && !use_global_stats) {
-      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
-      return;
-    }
-
-    int scale_coefff = use_global_stats ? 1 : N * sample_size;
-    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
-
-    Tensor dy_sum;
-    dy_sum.Resize({C});
-    dy_sum.mutable_data<T>(ctx.GetPlace());
-    EigenVectorArrayMap<T> dy_sum_arr(dy_sum.mutable_data<T>(ctx.GetPlace()),
-                                      C);
-
-    Tensor dy_mul_x_sub_mean_mul_invstd_sum;
-    dy_mul_x_sub_mean_mul_invstd_sum.Resize({C});
-    dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>(ctx.GetPlace());
-    EigenVectorArrayMap<T> dy_mul_x_sub_mean_mul_invstd_sum_arr(
-        dy_mul_x_sub_mean_mul_invstd_sum.mutable_data<T>(ctx.GetPlace()), C);
-
-    dy_sum_arr.setZero();
-    dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero();
-
-    // inplace calculation
-    // Y:  ((x - est_mean) * (inv_var) * scale + bias
-    //   formula transform ====>
-    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
-    // X: (y - bias) / scale / (inv_var) + est_mean
-    //   formula transform ====>
-    //    (y - bias) / (scale * inv_var) + est_mean
-    switch (data_layout) {
-      case DataLayout::kNCHW: {
-        if (is_inplace) {
-          auto px = *x;
-          EigenArrayMap<T> x_data(px.mutable_data<T>(ctx.GetPlace()),
-                                  sample_size, N * C);
-          ConstEigenArrayMap<T> y_data(x->data<T>(), sample_size, N * C);
-          for (int nc = 0; nc < N * C; ++nc) {
-            x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) /
-                                 scale_inv_var_nhw(nc % C) / scale_coefff +
-                             mean_arr(nc % C);
-          }
-        }
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
-        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
-
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          dy_sum_arr(c) += d_y_arr.col(nc).sum();
-          dy_mul_x_sub_mean_mul_invstd_sum_arr(c) +=
-              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
-                  .sum();
-        }
-
-        if (d_scale && d_bias) {
-          d_bias_arr = dy_sum_arr;
-          d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
-        }
-
-        if (d_x) {
-          EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
-                                   sample_size, N * C);
-          if (!use_global_stats) {
-            for (int nc = 0; nc < N * C; ++nc) {
-              int c = nc % C;
-              d_x_arr.col(nc) =
-                  scale_inv_var_nhw(c) *
-                  (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) -
-                   (x_arr.col(nc) - mean_arr[c]) *
-                       dy_mul_x_sub_mean_mul_invstd_sum_arr(c) *
-                       inv_var_arr(c));
-            }
-          } else {
-            for (int nc = 0; nc < N * C; ++nc) {
-              int c = nc % C;
-              d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc);
-            }
-          }
-        }
-        break;
-      }
-      case DataLayout::kNHWC: {
-        if (is_inplace) {
-          auto px = *x;
-          EigenArrayMap<T> x_data(px.mutable_data<T>(ctx.GetPlace()), C,
-                                  N * sample_size);
-          ConstEigenArrayMap<T> y_data(x->data<T>(), C, N * sample_size);
-          for (int nhw = 0; nhw < N * sample_size; nhw++) {
-            x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw /
-                                  scale_coefff +
-                              mean_arr;
-          }
-        }
-        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
-        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
-
-        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-          dy_sum_arr += d_y_arr.col(nhw);
-          dy_mul_x_sub_mean_mul_invstd_sum_arr +=
-              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
-        }
-
-        if (d_scale && d_bias) {
-          d_bias_arr = dy_sum_arr;
-          d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr;
-        }
-
-        if (d_x) {
-          EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
-                                   N * sample_size);
-          if (!use_global_stats) {
-            for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-              d_x_arr.col(nhw) =
-                  scale_inv_var_nhw *
-                  (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr -
-                   (x_arr.col(nhw) - mean_arr) *
-                       dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr);
-            }
-          } else {
-            for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-              d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw);
-            }
-          }
-        }
-        break;
-      }
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Unknown storage order: %s", data_layout_str));
-    }
-  }
-};
-
 template <typename T>
 void BatchNormGradMaker<T>::Apply(GradOpPtr<T> op) const {
   op->SetType(this->ForwardOpType() + "_grad");
@@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType(
       OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
 }
 
-template <typename T>
-class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *X = ctx.Input<Tensor>("X");
-    const auto *Scale = ctx.Input<Tensor>("Scale");
-    const auto *dY = ctx.Input<Tensor>("DY");
-    const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
-    const float epsilon = ctx.Attr<float>("epsilon");
-    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
-    const bool is_test = ctx.Attr<bool>("is_test");
-
-    PADDLE_ENFORCE_EQ(
-        is_test, false,
-        platform::errors::InvalidArgument(
-            "`is_test = True` CANNOT be used in train program. If "
-            "you want to use global status in pre_train model, "
-            "please set `use_global_stats = True`"));
-
-    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
-    const DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-
-    const auto *ddX = ctx.Input<Tensor>("DDX");
-    const auto *ddScale = ctx.Input<Tensor>("DDScale");
-    const auto *ddBias = ctx.Input<Tensor>("DDBias");
-
-    auto *dX = ctx.Output<Tensor>("DX");
-    auto *dScale = ctx.Output<Tensor>("DScale");
-    auto *ddY = ctx.Output<Tensor>("DDY");
-    dX->mutable_data<T>(ctx.GetPlace());
-    ddY->mutable_data<T>(ctx.GetPlace());
-
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-
-    const auto &x_dims = X->dims();
-    const int C =
-        (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                          : x_dims[x_dims.size() - 1]);
-    const int sample_size = X->numel() / C;
-    phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;
-
-    const T *mean_data = Saved_mean->data<T>();
-    const T *inv_var_data = Saved_variance->data<T>();
-
-    Tensor inv_var_tensor;
-    if (use_global_stats) {
-      const auto *running_mean = ctx.Input<Tensor>("Mean");
-      const auto *running_variance = ctx.Input<Tensor>("Variance");
-      mean_data = running_mean->data<T>();
-      inv_var_tensor.Resize({C});
-
-      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
-      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
-      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
-
-      inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
-      inv_var_data = running_inv_var_data;
-    }
-
-    // transpose NCHW -> NHWC for easy calculate
-    Tensor transformed_x(X->type());
-    Tensor transformed_dy(dY->type());
-    Tensor transformed_ddx(ddX->type());
-
-    Tensor transformed_dx(dX->type());
-    Tensor transformed_ddy(ddY->type());
-    if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
-      VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
-      // Input Tensor
-      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, X,
-                                                         &transformed_x);
-      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, X, &transformed_x);
-      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
-                                                         &transformed_dy);
-      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
-                                                        &transformed_dy);
-      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
-                                                         &transformed_ddx);
-      TransToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
-                                                        &transformed_ddx);
-      // Output Tensor
-      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dX,
-                                                         &transformed_dx);
-      ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddY,
-                                                         &transformed_ddy);
-    } else {
-      transformed_x.ShareDataWith(*X);
-      transformed_dy.ShareDataWith(*dY);
-      transformed_ddx.ShareDataWith(*ddX);
-
-      transformed_dx.ShareDataWith(*dX);
-      transformed_ddy.ShareDataWith(*ddY);
-    }
-
-    ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
-    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
-    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
-
-    Tensor mean_tile;
-    mean_tile.Resize({C, sample_size});
-    mean_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> mean_tile_data(mean_tile.mutable_data<T>(ctx.GetPlace()),
-                                    C, sample_size);
-
-    Tensor inv_var_tile;
-    inv_var_tile.Resize({C, sample_size});
-    inv_var_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> inv_var_tile_data(
-        inv_var_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-
-    mean_tile_data = mean_arr.replicate(1, sample_size);
-    inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
-
-    Tensor Scale_data;
-    if (!Scale) {
-      Scale_data.mutable_data<T>({C}, ctx.GetPlace());
-      set_constant(dev_ctx, &Scale_data, static_cast<T>(1));
-    }
-    ConstEigenVectorArrayMap<T> scale_arr(
-        Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
-
-    Tensor scale_tile;
-    scale_tile.Resize({C, sample_size});
-    scale_tile.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> scale_tile_data(scale_tile.mutable_data<T>(ctx.GetPlace()),
-                                     C, sample_size);
-    scale_tile_data = scale_arr.replicate(1, sample_size);
-
-    ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
-    ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
-
-    Tensor x_sub_mean_mul_invstd;
-    x_sub_mean_mul_invstd.Resize({C, sample_size});
-    x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
-    EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
-        x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-    x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
-
-    if (dX) {
-      dX->mutable_data<T>(ctx.GetPlace());
-      EigenArrayMap<T> dx_arr(transformed_dx.mutable_data<T>(ctx.GetPlace()), C,
-                              sample_size);
-      dx_arr.setZero();
-      if (use_global_stats) {
-        // math: dx = (ddscale * dy) * inv_var
-        if (ddScale) {
-          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-          Tensor ddscale_tile;
-          ddscale_tile.Resize({C, sample_size});
-          EigenArrayMap<T> ddscale_tile_data(
-              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
-
-          dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
-        }
-      } else {
-        // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx,
-        // axis=(n,h,w)) *
-        //          np.sum(dy, axis=(n,h,w)) -
-        //          np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x -
-        //          mean),
-        //          axis=(n,h,w)) * inv_var.pow(2) *
-        //          np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) /
-        //          NxHxW *
-        //          np.sum(ddx * (x - mean)) *
-        //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
-        //          np.sum(dy,
-        //          axis=(n,h,w)) * (x - mean) *
-        //          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
-        //          inv_var
-        //          *
-        //          np.mean(dy, axis=(n,h,w)) -
-        //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
-        //          axis=(n,h,w)))
-
-        if (ddX) {
-          dx_arr +=
-              (x_sub_mean_mul_invstd_arr * inv_var_tile_data *
-               inv_var_tile_data / sample_size)
-                  .colwise() *
-              (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
-               (dy_arr * ddx_arr).rowwise().sum() +
-               3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
-                   (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
-                   sample_size);
-
-          dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
-                    (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
-                    sample_size *
-                    (dy_arr.rowwise().sum() / sample_size - dy_arr);
-
-          dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
-                    (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
-                    sample_size *
-                    (ddx_arr.rowwise().sum() / sample_size - ddx_arr);
-
-          dx_arr = scale_tile_data * dx_arr;
-        }
-        if (ddScale) {
-          ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-          Tensor ddscale_tile;
-          ddscale_tile.Resize({C, sample_size});
-          EigenArrayMap<T> ddscale_tile_data(
-              ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-          ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
-
-          dx_arr += (dy_arr * inv_var_tile_data -
-                     (dy_arr.rowwise().sum().replicate(1, sample_size) /
-                      sample_size) *
-                         inv_var_tile_data -
-                     x_sub_mean_mul_invstd_arr * inv_var_tile_data *
-                         (dy_arr * x_sub_mean_mul_invstd_arr)
-                             .rowwise()
-                             .sum()
-                             .replicate(1, sample_size) /
-                         sample_size) *
-                    ddscale_tile_data;
-        }
-      }
-      if (data_layout == DataLayout::kNCHW) {
-        VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
-        TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
-            ctx, &transformed_dx, dX);
-      }
-    }
-    if (dScale) {
-      dScale->mutable_data<T>(ctx.GetPlace());
-      EigenVectorArrayMap<T> dscale_arr(dScale->mutable_data<T>(ctx.GetPlace()),
-                                        C);
-      dscale_arr.setZero();
-      if (use_global_stats) {
-        // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
-        if (ddX) {
-          dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
-        }
-      } else {
-        // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) *
-        //            inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
-        //            ddx
-        if (ddX) {
-          Tensor first_grad;
-          first_grad.Resize({C, sample_size});
-          EigenArrayMap<T> first_grad_arr(
-              first_grad.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-          first_grad_arr.setZero();
-
-          first_grad_arr +=
-              inv_var_tile_data *
-              (dy_arr -
-               dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
-               x_sub_mean_mul_invstd_arr *
-                   (dy_arr * x_sub_mean_mul_invstd_arr)
-                       .rowwise()
-                       .sum()
-                       .replicate(1, sample_size) /
-                   sample_size);
-          dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
-        }
-      }
-    }
-
-    if (ddY) {
-      ddY->mutable_data<T>(ctx.GetPlace());
-      EigenArrayMap<T> ddy_arr(transformed_ddy.mutable_data<T>(ctx.GetPlace()),
-                               C, sample_size);
-      ddy_arr.setZero();
-      if (use_global_stats) {
-        // math: ddy = r * ddx * inv_var + ddbias +
-        //           ddscale * (x - mean) * inv_var
-        if (ddX) {
-          ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
-        }
-      } else {
-        // math: ddy = (x - mean) * inv_var * ddscale + ddbias +
-        //           scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
-        //           np.mean(ddx * (x - mean), axis=(n,h,w)))
-        if (ddX) {
-          ddy_arr +=
-              scale_tile_data * inv_var_tile_data *
-              (ddx_arr -
-               ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
-               x_sub_mean_mul_invstd_arr *
-                   (ddx_arr * x_sub_mean_mul_invstd_arr)
-                       .rowwise()
-                       .sum()
-                       .replicate(1, sample_size) /
-                   sample_size);
-        }
-      }
-      if (ddScale) {
-        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
-        Tensor ddscale_tile;
-        ddscale_tile.Resize({C, sample_size});
-        EigenArrayMap<T> ddscale_tile_data(
-            ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
-
-        ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
-      }
-
-      if (ddBias) {
-        ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
-        Tensor ddbias_tile;
-        ddbias_tile.Resize({C, sample_size});
-        EigenArrayMap<T> ddbias_tile_data(
-            ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
-        ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
-
-        ddy_arr += ddbias_tile_data;
-      }
-
-      if (data_layout == DataLayout::kNCHW) {
-        VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
-        TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
-            ctx, &transformed_ddy, ddY);
-      }
-    }
-  }
-};
-
 DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"});
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor,
+                            PD_INFER_META(phi::BatchNormInferMeta));
+
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                   ops::BatchNormOpInferVarType,
                   ops::BatchNormGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index f8d37d685b929..d274e8d2c006d 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -113,23 +113,5 @@ class BatchNormOpInferVarType
   }
 };
 
-template <typename DeviceContext, typename T>
-class BatchNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class BatchNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
-template <typename DeviceContext, typename T>
-class BatchNormDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override;
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc
index 0e64b461786cc..6507890a8b5dc 100644
--- a/paddle/fluid/operators/batch_norm_op_mlu.cc
+++ b/paddle/fluid/operators/batch_norm_op_mlu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
@@ -20,6 +21,8 @@ namespace operators {
 
 template <typename T>
 class MLUBatchNormOpKernel : public framework::OpKernel<T> {
+  using MPDType = typename details::MPTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto &place = ctx.GetPlace();
@@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
 
     // alloc memory
     y->mutable_data<T>(place);
-    mean_out->mutable_data<T>(place);
-    variance_out->mutable_data<T>(place);
-    saved_mean->mutable_data<T>(place);
-    saved_variance->mutable_data<T>(place);
+    mean_out->mutable_data<MPDType>(place);
+    variance_out->mutable_data<MPDType>(place);
+    saved_mean->mutable_data<MPDType>(place);
+    saved_variance->mutable_data<MPDType>(place);
 
     Tensor transformed_x;
     Tensor transformed_y;
@@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel<T> {
 
 template <typename T>
 class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
+  using MPDType = typename details::MPTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *x = ctx.Input<Tensor>("X");
@@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
     auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
     auto d_x_tmp =
         ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
-    auto scale_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(scale->dims(), dev_ctx);
+    auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
+        scale->dims(), dev_ctx);
     auto bias_grad_tmp =
-        ctx.AllocateTmpTensor<T, MLUDeviceContext>(bias->dims(), dev_ctx);
+        ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
 
     if (d_x == nullptr) {
       d_x = &d_x_tmp;
@@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
 
     const auto &place = ctx.GetPlace();
     d_x->mutable_data<T>(place);
-    d_scale->mutable_data<T>(place);
-    d_bias->mutable_data<T>(place);
+    d_scale->mutable_data<MPDType>(place);
+    d_bias->mutable_data<MPDType>(place);
 
     use_global_stats = is_test || use_global_stats;
 
diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc
index a70b6e991161d..ae03ecbcb16a0 100644
--- a/paddle/fluid/operators/batch_norm_op_npu.cc
+++ b/paddle/fluid/operators/batch_norm_op_npu.cc
@@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel<T> {
       auto *variance_out = ctx.Output<Tensor>("VarianceOut");
       auto *saved_mean = ctx.Output<Tensor>("SavedMean");
       auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
-      mean_out->mutable_data<T>(ctx.GetPlace());
-      variance_out->mutable_data<T>(ctx.GetPlace());
-      saved_mean->mutable_data<T>(ctx.GetPlace());
-      saved_variance->mutable_data<T>(ctx.GetPlace());
+      mean_out->mutable_data<float>(ctx.GetPlace());
+      variance_out->mutable_data<float>(ctx.GetPlace());
+      saved_mean->mutable_data<float>(ctx.GetPlace());
+      saved_variance->mutable_data<float>(ctx.GetPlace());
 
       // if MomentumTensor is set, use MomentumTensor value, momentum
       // is only used in this training branch
@@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
 
     auto stream = ctx.template device_context<NPUDeviceContext>().stream();
     if (d_scale && d_bias) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      d_bias->mutable_data<T>(ctx.GetPlace());
+      d_scale->mutable_data<float>(ctx.GetPlace());
+      d_bias->mutable_data<float>(ctx.GetPlace());
       if (use_global_stats) {
         const auto *running_mean = ctx.Input<Tensor>("Mean");
         const auto *running_variance = ctx.Input<Tensor>("Variance");
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
index edf854a9c95b0..8139530b809ab 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
@@ -24,7 +24,9 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
+    // The cinn-graph may hasn't input for CINN now support fill_constant,
+    // and its all inputs may generated by fill_constant instead of by fetch.
+    // OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
     OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
                    "CinnInstructionRun");
     const CinnCompiledObject& compiled_object =
@@ -43,6 +45,53 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
                    });
     ctx->SetOutputsDim(kOutputs, output_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // Why we need override GetExpectedKernelType?
+    // A cinn-graph may has no inpute var, if we use the base function,
+    // it will check wheter input tensors is initialized. Here we rewrite
+    // the function so that we can infer kernel type by output date type.
+    if (ctx.InputSize(kX)) {
+      // if the instruction has input, infer kernel type by input date type:
+      return OperatorWithKernel::GetExpectedKernelType(ctx);
+    }
+
+    // Else infer kernel type by output date type:
+    // The `OutputVar` will check wheter the kOutputs iff has one output var
+    const framework::Variable* var = ctx.OutputVar(kOutputs);
+    PADDLE_ENFORCE_NE(
+        var, nullptr,
+        platform::errors::InvalidArgument(
+            "The cinn_instruction_run Op's Output Variable should not empty."));
+
+    const framework::Tensor* tensor = nullptr;
+    if (var->IsType<framework::Tensor>()) {
+      tensor = &var->Get<framework::Tensor>();
+    } else if (var->IsType<framework::LoDTensor>()) {
+      tensor = &var->Get<framework::LoDTensor>();
+    } else if (var->IsType<phi::SelectedRows>()) {
+      tensor = &(var->Get<phi::SelectedRows>().value());
+    } else if (var->IsType<framework::LoDTensorArray>()) {
+      auto t_arr = &var->Get<framework::LoDTensorArray>();
+      PADDLE_ENFORCE_EQ(t_arr->size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "The cinn_instruction_run Op should just has One "
+                            "Output when Input empty."));
+      tensor = &(t_arr->front());
+    }
+
+    PADDLE_ENFORCE_NE(
+        tensor, nullptr,
+        platform::errors::InvalidArgument(
+            "The cinn_instruction_run Op's Output Tensor should not empty."));
+
+    VLOG(4) << "The tensor [" << ctx.OutputName(kOutputs) << "]'s dtype is "
+            << paddle::framework::DataType2String(tensor->dtype());
+    auto output_type = paddle::framework::TransToProtoVarType(tensor->dtype());
+    return framework::OpKernelType(output_type, ctx.device_context());
+  }
 };
 
 class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc
index d918b7216c4d2..5d006a947be19 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc
@@ -87,9 +87,12 @@ class CinnLaunchOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
-                   "Input", string::format_string("%s|%s", kX, kNoNeedBufferX),
-                   "CinnLaunchOp");
+    // The cinn-graph may hasn't input for CINN now support fill_constant,
+    // and its all inputs may generated by fill_constant instead of by fetch.
+    // OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
+    //                "Input", string::format_string("%s|%s", kX,
+    //                kNoNeedBufferX),
+    //                "CinnLaunchOp");
     OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
                    "CinnLaunchOp");
   }
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index bc29c92b09426..8a190c1a1e091 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -40,6 +40,13 @@ class FeedVariableVisitor : public boost::static_visitor<void> {
         out_var_->GetMutable<framework::LoDTensor>();
     if (platform::is_same_place(in_tensor.place(), place_)) {
       out_tensor->ShareDataWith(in_tensor);
+#ifdef PADDLE_WITH_IPU
+    } else if (platform::is_ipu_place(place_)) {
+      // For ipu, both in_tensor and out_tensor are allocated on cpu,
+      // PopART will copy tensor from host automatically,
+      // no TensorCopy() is required here.
+      out_tensor->ShareDataWith(in_tensor);
+#endif
     } else {
       platform::DeviceContext *context =
           platform::DeviceContextPool::Instance().Get(place_);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8213e877f7224..9be63a85fc0de 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -27,6 +27,9 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/binary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker<paddle::framework::OpDesc>,
@@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad,
 REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
 // depthwise convolution op
+DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv2DGradMaker<paddle::framework::OpDesc>,
@@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad,
                   ops::Conv2DDoubleGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad);
 
+DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor,
+                            PD_INFER_META(phi::ConvInferMeta));
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
                   ops::ConvOpInferVarType,
                   ops::Conv3DGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc
index 8897f7b229c32..fcda16a3e72ac 100644
--- a/paddle/fluid/operators/conv_op_npu.cc
+++ b/paddle/fluid/operators/conv_op_npu.cc
@@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
 
     auto stream = ctx.template device_context<NPUDeviceContext>().stream();
     if (filter_grad) {
-      filter_grad->mutable_data<T>(ctx.GetPlace());
+      filter_grad->mutable_data<float>(ctx.GetPlace());
       std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());
 
       const auto& runner = NpuOpRunner(
diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc
index ddfc6fe862c27..e4751f1f26008 100644
--- a/paddle/fluid/operators/conv_op_xpu.cc
+++ b/paddle/fluid/operators/conv_op_xpu.cc
@@ -19,14 +19,16 @@ namespace operators {
 
 template <typename DeviceContext, typename T>
 class GemmConvXPUKernel : public framework::OpKernel<T> {
+  using XPUT = typename XPUTypeTrait<T>::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input<Tensor>("Input");
     // The filter will be reshaped in the calculations,
     // so here use an assignment operation,
     // that avoids modifying the variable in the Scope.
     Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
+    Tensor *output = context.Output<Tensor>("Output");
     output->mutable_data<T>(context.GetPlace());
     int groups = context.Attr<int>("groups");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
@@ -53,11 +55,16 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
     const int img_h = static_cast<int>(input->dims()[2]);
     const int img_w = static_cast<int>(input->dims()[3]);
     const int f = static_cast<int>(filter.dims()[0]);
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    int r = xpu::conv2d<float, float, float, int16_t>(
-        dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
-        output->data<float>(), batch_size, img_c, img_h, img_w, f, ksize,
-        strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
+
+    const XPUT *input_data = reinterpret_cast<const XPUT *>(input->data<T>());
+    const XPUT *filter_data = reinterpret_cast<const XPUT *>(filter.data<T>());
+    XPUT *output_data = reinterpret_cast<XPUT *>(output->data<T>());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    int r = xpu::conv2d<XPUT, XPUT, XPUT, int16_t>(
+        dev_ctx.x_context(), input_data, filter_data, output_data, batch_size,
+        img_c, img_h, img_w, f, ksize, strides, paddings, dilations, groups,
+        nullptr, nullptr, nullptr, true);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU conv kernel return wrong value[%d %s]",
@@ -67,14 +74,16 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
 
 template <typename DeviceContext, typename T>
 class GemmConvGradXPUKernel : public framework::OpKernel<T> {
+  using XPUT = typename XPUTypeTrait<T>::Type;
+
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *input = context.Input<Tensor>("Input");
+    const Tensor *output_grad =
         context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
+    Tensor *input_grad =
         context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
+    Tensor *filter_grad =
         context.Output<Tensor>(framework::GradVarName("Filter"));
     // The filter and filter_grad will be reshaped in the calculations,
     // so here use an assignment operation,
@@ -107,19 +116,27 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
     const int img_h = static_cast<int>(input->dims()[2]);
     const int img_w = static_cast<int>(input->dims()[3]);
     const int f = static_cast<int>(filter.dims()[0]);
+
+    const XPUT *input_data = reinterpret_cast<const XPUT *>(input->data<T>());
+    const XPUT *filter_data = reinterpret_cast<const XPUT *>(filter.data<T>());
+    const XPUT *output_grad_data =
+        reinterpret_cast<const XPUT *>(output_grad->data<T>());
+    XPUT *input_grad_data = nullptr;
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
+      input_grad_data = reinterpret_cast<XPUT *>(input_grad->data<T>());
     }
+    XPUT *filter_grad_data = nullptr;
     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
+      filter_grad_data = reinterpret_cast<XPUT *>(filter_grad->data<T>());
     }
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    int r = xpu::conv2d_grad<float, float, float, int16_t>(
-        dev_ctx.x_context(), input->data<T>(), filter.data<T>(),
-        output_grad->data<T>(), input_grad ? input_grad->data<T>() : nullptr,
-        filter_grad ? filter_grad->data<T>() : nullptr, batch_size, img_c,
-        img_h, img_w, f, ksize, strides, paddings, dilations, groups, nullptr,
-        nullptr, nullptr, nullptr, nullptr, true);
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    int r = xpu::conv2d_grad<XPUT, XPUT, XPUT, int16_t>(
+        dev_ctx.x_context(), input_data, filter_data, output_grad_data,
+        input_grad_data, filter_grad_data, batch_size, img_c, img_h, img_w, f,
+        ksize, strides, paddings, dilations, groups, nullptr, nullptr, nullptr,
+        nullptr, nullptr, true);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU conv kernel return wrong value[%d %s]",
@@ -130,14 +147,22 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
-    depthwise_conv2d,
-    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(
-    conv2d, ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    conv2d, ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext,
+                           paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
     conv2d_grad,
-    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext,
+                               paddle::platform::float16>);
+REGISTER_OP_XPU_KERNEL(
+    depthwise_conv2d,
+    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext,
+                           paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
     depthwise_conv2d_grad,
-    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext,
+                               paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu
deleted file mode 100644
index 1841b78af32dd..0000000000000
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu
+++ /dev/null
@@ -1,1286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memory.h"
-#ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/operators/conv_miopen_helper.h"
-#else
-#include "paddle/fluid/operators/conv_cudnn_helper.h"
-#endif
-#include "paddle/fluid/operators/conv_transpose_op.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T, int D>
-static void DataTranspose(const framework::ExecutionContext& ctx,
-                          const Tensor* input, Tensor* output,
-                          const std::vector<int>& axis, int flag = 0) {
-  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-  phi::funcs::Transpose<platform::CUDADeviceContext, T, D> transpose;
-  auto in_dims = input->dims();
-  std::vector<int64_t> input_transpose_vec;
-  for (size_t i = 0; i < axis.size(); ++i) {
-    if (flag == 0)
-      input_transpose_vec.push_back(in_dims[axis[i]]);
-    else
-      input_transpose_vec.push_back(in_dims[i]);
-  }
-  framework::DDim input_transpose_dims(phi::make_ddim(input_transpose_vec));
-  output->mutable_data<T>(input_transpose_dims, ctx.GetPlace());
-  transpose(dev_ctx, *input, output, axis);
-}
-
-template <typename T>
-class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* filter = ctx.Input<Tensor>("Filter");
-    auto* output = ctx.Output<Tensor>("Output");
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-
-    // cudnn v5 does not support dilations
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    const T* filter_data = filter->data<T>();
-    const std::string data_layout_str = ctx.Attr<std::string>("data_format");
-    const paddle::platform::DataLayout data_layout =
-        (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
-                                   : platform::DataLayout::kNHWC);
-
-    // if channel_last, transpose to channel_first
-    Tensor input_transpose;
-    std::vector<int> input_vec = phi::vectorize<int>(input->dims());
-    std::vector<int> output_vec = phi::vectorize<int>(output->dims());
-    if (data_layout == platform::DataLayout::kNHWC) {
-      if (strides.size() == 2U) {
-        std::vector<int> axis = {0, 3, 1, 2};
-        for (size_t i = 0; i < axis.size(); ++i) {
-          input_vec[i] = input->dims()[axis[i]];
-          output_vec[i] = output->dims()[axis[i]];
-        }
-        DataTranspose<T, 4>(ctx, input, &input_transpose, axis);
-      } else if (strides.size() == 3U) {
-        std::vector<int> axis = {0, 4, 1, 2, 3};
-        for (size_t i = 0; i < axis.size(); ++i) {
-          input_vec[i] = input->dims()[axis[i]];
-          output_vec[i] = output->dims()[axis[i]];
-        }
-        DataTranspose<T, 5>(ctx, input, &input_transpose, axis);
-      }
-    } else {
-      input_transpose = *input;
-    }
-
-    // update padding and dilation
-    auto in_dims = input_transpose.dims();
-    auto filter_dims = filter->dims();
-    framework::DDim in_data_dims;
-    in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
-
-    std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
-    Tensor transformed_input;
-    std::vector<int> padding_common(data_dim, 0);
-    if (!is_sys_pad) {
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_input_shape_vec(data_dim + 2);
-      new_input_shape_vec[0] = input_transpose.dims()[0];
-      new_input_shape_vec[1] = input_transpose.dims()[1];
-
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            input_transpose.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-      }
-      framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
-      transformed_input.Resize(new_input_shape);
-      auto& dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-
-      transformed_input =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_input_shape, dev_ctx);
-      const int rank = input_transpose.dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              dev_ctx, input_pad, input_transpose, pad_value,
-              &transformed_input);
-        } break;
-        case 5: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              dev_ctx, input_pad, input_transpose, pad_value,
-              &transformed_input);
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
-      }
-    } else {
-      transformed_input = input_transpose;
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    std::vector<int64_t> starts(data_dim, 0);
-    std::vector<int64_t> ends(data_dim, 0);
-    std::vector<int64_t> axes(data_dim, 0);
-    for (size_t i = 0; i < data_dim; ++i) {
-      starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
-      ends[i] = starts[i] + output_vec[i + 2];
-      axes[i] = i + 2;
-    }
-
-    const T* input_data = transformed_input.data<T>();
-    input_vec = phi::vectorize<int>(transformed_input.dims());
-
-    std::vector<int> transformed_output_vec = output_vec;
-    for (size_t i = 0; i < data_dim; ++i) {
-      transformed_output_vec[i + 2] =
-          output_vec[i + 2] +
-          (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
-          2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1];
-    }
-
-    Tensor transformed_output;
-    if (!is_sys_pad) {
-      DDim transformed_output_shape(phi::make_ddim(transformed_output_vec));
-      transformed_output.mutable_data<T>(transformed_output_shape,
-                                         ctx.GetPlace());
-    } else {
-      output->mutable_data<T>(ctx.GetPlace());
-      transformed_output.ShareDataWith(*output);
-      transformed_output.Resize(phi::make_ddim(transformed_output_vec));
-    }
-    T* transformed_output_data = transformed_output.data<T>();
-
-    platform::DataLayout layout;
-
-    int iwo_groups = groups;
-    int c_groups = 1;
-#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_groups = 1;
-    c_groups = groups;
-    groups = 1;
-#endif
-
-    if (strides.size() == 2U) {
-      layout = platform::DataLayout::kNCHW;
-    } else {
-      layout = platform::DataLayout::kNCDHW;
-    }
-
-    size_t workspace_size = 0;
-#ifdef PADDLE_WITH_HIP
-    miopenConvBwdDataAlgorithm_t algo{};
-#else
-    cudnnConvolutionBwdDataAlgo_t algo{};
-#endif
-    // ------------------- cudnn conv algorithm ---------------------
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    auto layout_tensor = GetCudnnTensorFormat(layout);
-    bool deterministic = FLAGS_cudnn_deterministic;
-
-    auto dtype = platform::CudnnDataType<T>::type;
-    // ------------------- cudnn descriptors ---------------------
-    ConvArgs args{&transformed_output,
-                  filter,
-                  &transformed_input,
-                  strides,
-                  padding_common,
-                  dilations,
-                  dtype};
-    args.handle = handle;
-    args.idesc.set(transformed_output, iwo_groups);
-    args.wdesc.set(*filter, layout_tensor, iwo_groups);
-    args.odesc.set(transformed_input, iwo_groups);
-    args.cdesc.set(dtype, padding_common, strides, dilations,
-                   platform::AllowTF32Cudnn(), c_groups);
-
-#ifdef PADDLE_WITH_HIP
-    using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-    workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
-    algo = search::Find<T>(
-        args, false, deterministic, workspace_size,
-        ctx.template device_context<platform::CUDADeviceContext>());
-#else
-    using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-    algo = search::Find<T>(
-        args, false, deterministic,
-        ctx.template device_context<platform::CUDADeviceContext>());
-    workspace_size =
-        std::max(workspace_size, search::GetWorkspaceSize(args, algo));
-#endif
-
-    // ------------------- cudnn conv transpose forward ---------------------
-    int input_offset =
-        transformed_input.numel() / transformed_input.dims()[0] / groups;
-    int output_offset =
-        transformed_output.numel() / transformed_output.dims()[0] / groups;
-    int filter_offset = filter->numel() / groups;
-    ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    for (int g = 0; g < groups; g++) {
-#ifdef PADDLE_WITH_HIP
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::miopenConvolutionBackwardData(
-                handle, &alpha, args.odesc.desc(),
-                input_data + input_offset * g, args.wdesc.desc(),
-                filter_data + filter_offset * g, args.cdesc.desc(), algo, &beta,
-                args.idesc.desc(), transformed_output_data + output_offset * g,
-                cudnn_workspace, workspace_size));
-      };
-#else   // PADDLE_WITH_HIP
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        PADDLE_ENFORCE_GPU_SUCCESS(
-            platform::dynload::cudnnConvolutionBackwardData(
-                handle, &alpha, args.wdesc.desc(),
-                filter_data + filter_offset * g, args.odesc.desc(),
-                input_data + input_offset * g, args.cdesc.desc(), algo,
-                cudnn_workspace, workspace_size, &beta, args.idesc.desc(),
-                transformed_output_data + output_offset * g));
-      };
-#endif  // PADDLE_WITH_HIP
-      workspace_handle.RunFunc(cudnn_func, workspace_size);
-    }
-    if (!is_sys_pad && strides.size() == 2U) {
-      Slice<paddle::platform::CUDADeviceContext, T, 4>(
-          ctx, &transformed_output, output, starts, ends, axes);
-    } else if (!is_sys_pad && strides.size() == 3U) {
-      Slice<paddle::platform::CUDADeviceContext, T, 5>(
-          ctx, &transformed_output, output, starts, ends, axes);
-    }
-
-    if (data_layout == platform::DataLayout::kNHWC) {
-      Tensor output_transpose;
-      Tensor output_nchw;
-      output_nchw.ShareDataWith(*output);
-      output_nchw.Resize(phi::make_ddim(output_vec));
-      if (strides.size() == 2U) {
-        std::vector<int> axis = {0, 2, 3, 1};
-        DataTranspose<T, 4>(ctx, &output_nchw, &output_transpose, axis);
-        *output = output_transpose;
-      } else if (strides.size() == 3U) {
-        std::vector<int> axis = {0, 2, 3, 4, 1};
-        DataTranspose<T, 5>(ctx, &output_nchw, &output_transpose, axis);
-        *output = output_transpose;
-      }
-    }
-  }
-};
-
-template <typename T>
-class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    auto input = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
-    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
-    const T* filter_data = filter->data<T>();
-
-    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    // cudnn v5 does not support dilations
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
-    const std::string data_layout_str = ctx.Attr<std::string>("data_format");
-    const paddle::platform::DataLayout data_layout =
-        (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
-                                   : platform::DataLayout::kNHWC);
-
-    // if channel_last, transpose to channel_first
-    Tensor input_transpose;
-    Tensor output_grad_transpose;
-    std::vector<int> input_vec = phi::vectorize<int>(input->dims());
-    std::vector<int> output_vec = phi::vectorize<int>(output_grad->dims());
-    if (data_layout == platform::DataLayout::kNHWC) {
-      if (strides.size() == 2U) {
-        std::vector<int> axis = {0, 3, 1, 2};
-        for (size_t i = 0; i < axis.size(); ++i) {
-          input_vec[i] = input->dims()[axis[i]];
-          output_vec[i] = output_grad->dims()[axis[i]];
-        }
-        DataTranspose<T, 4>(ctx, input, &input_transpose, axis);
-        DataTranspose<T, 4>(ctx, output_grad, &output_grad_transpose, axis);
-      } else if (strides.size() == 3U) {
-        std::vector<int> axis = {0, 4, 1, 2, 3};
-        for (size_t i = 0; i < axis.size(); ++i) {
-          input_vec[i] = input->dims()[axis[i]];
-          output_vec[i] = output_grad->dims()[axis[i]];
-        }
-        DataTranspose<T, 5>(ctx, input, &input_transpose, axis);
-        DataTranspose<T, 5>(ctx, output_grad, &output_grad_transpose, axis);
-      }
-    } else {
-      input_transpose = *input;
-      output_grad_transpose = *output_grad;
-    }
-
-    // update padding and dilation
-    auto in_dims = input_transpose.dims();
-    auto filter_dims = filter->dims();
-    framework::DDim in_data_dims;
-    in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
-
-    std::vector<int> input_pad(input_transpose.dims().size() * 2, 0);
-    Tensor transformed_output_grad;
-    std::vector<int> padding_common(data_dim, 0);
-    if (!is_sys_pad) {
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_output_grad_shape_vec(data_dim + 2);
-      new_output_grad_shape_vec[0] = output_grad_transpose.dims()[0];
-      new_output_grad_shape_vec[1] = output_grad_transpose.dims()[1];
-
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_output_grad_shape_vec[i + 2] =
-            output_grad_transpose.dims()[i + 2] + padding_diff[i];
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-      }
-      framework::DDim new_output_grad_shape(
-          phi::make_ddim(new_output_grad_shape_vec));
-      transformed_output_grad.Resize(new_output_grad_shape);
-      auto& dev_ctx =
-          ctx.template device_context<paddle::platform::CUDADeviceContext>();
-
-      transformed_output_grad =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_output_grad_shape, dev_ctx);
-      const int rank = input_transpose.dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              dev_ctx, input_pad, output_grad_transpose, pad_value,
-              &transformed_output_grad);
-        } break;
-        case 5: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              dev_ctx, input_pad, output_grad_transpose, pad_value,
-              &transformed_output_grad);
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "Op(ConvTranspose) only supports 4-D or 5-D input Tensor."));
-      }
-    } else {
-      transformed_output_grad = output_grad_transpose;
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    const T* input_data = input_transpose.data<T>();
-    const T* output_grad_data = transformed_output_grad.data<T>();
-    output_vec = phi::vectorize<int>(transformed_output_grad.dims());
-
-    // ------------------- cudnn descriptors ---------------------
-    platform::DataLayout layout;
-
-    if (strides.size() == 2U) {
-      layout = platform::DataLayout::kNCHW;
-    } else {
-      layout = platform::DataLayout::kNCDHW;
-    }
-
-    int iwo_groups = groups;
-    int c_groups = 1;
-#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_groups = 1;
-    c_groups = groups;
-    groups = 1;
-#endif
-
-    auto dtype = platform::CudnnDataType<T>::type;
-
-    ConvArgs args1{&transformed_output_grad,
-                   filter,
-                   &input_transpose,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args2{&transformed_output_grad,
-                   filter,
-                   &input_transpose,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-
-#ifdef PADDLE_WITH_HIP
-    miopenConvFwdAlgorithm_t data_algo{};
-    miopenConvBwdWeightsAlgorithm_t filter_algo{};
-#else
-    cudnnConvolutionFwdAlgo_t data_algo{};
-    cudnnConvolutionBwdFilterAlgo_t filter_algo{};
-#endif
-
-    auto layout_tensor = GetCudnnTensorFormat(layout);
-    size_t workspace_size = 0;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto handle = dev_ctx.cudnn_handle();
-    bool deterministic = FLAGS_cudnn_deterministic;
-    T* input_grad_data = nullptr;
-    T* filter_grad_data = nullptr;
-
-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      args1.handle = handle;
-      args1.idesc.set(transformed_output_grad, iwo_groups);
-      args1.wdesc.set(*filter, layout_tensor, iwo_groups);
-      args1.odesc.set(input_transpose, iwo_groups);
-      args1.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_groups);
-#ifdef PADDLE_WITH_HIP
-      using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1));
-      data_algo = search1::Find<T>(
-          args1, false, deterministic, workspace_size,
-          ctx.template device_context<platform::CUDADeviceContext>());
-#else
-      using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      data_algo = search1::Find<T>(
-          args1, false, deterministic,
-          ctx.template device_context<platform::CUDADeviceContext>());
-      workspace_size =
-          std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
-#endif
-    }
-
-    if (filter_grad) {
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      args2.handle = handle;
-      args2.idesc.set(transformed_output_grad, iwo_groups);
-      args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups);
-      args2.odesc.set(input_transpose, iwo_groups);
-      args2.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_groups);
-#ifdef PADDLE_WITH_HIP
-      using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search2::GetWorkspaceSize(args2));
-      filter_algo = search2::Find<T>(
-          args2, false, deterministic, workspace_size,
-          ctx.template device_context<platform::CUDADeviceContext>());
-#else
-      using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo = search2::Find<T>(
-          args2, false, deterministic,
-          ctx.template device_context<platform::CUDADeviceContext>());
-      workspace_size = std::max(workspace_size,
-                                search2::GetWorkspaceSize(args2, filter_algo));
-#endif
-    }
-
-    // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
-    int input_offset = input->numel() / input->dims()[0] / groups;
-    int output_grad_offset = transformed_output_grad.numel() /
-                             transformed_output_grad.dims()[0] / groups;
-    int filter_offset = filter->numel() / groups;
-    ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    if (input_grad) {
-      // Because beta is zero, it is unnecessary to reset input_grad.
-      for (int g = 0; g < groups; g++) {
-#ifdef PADDLE_WITH_HIP
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::miopenConvolutionForward(
-                  handle, &alpha, args1.idesc.desc(),
-                  output_grad_data + output_grad_offset * g, args1.wdesc.desc(),
-                  filter_data + filter_offset * g, args1.cdesc.desc(),
-                  data_algo, &beta, args1.odesc.desc(),
-                  input_grad_data + input_offset * g, cudnn_workspace,
-                  workspace_size));
-        };
-#else   // PADDLE_WITH_HIP
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, args1.idesc.desc(),
-              output_grad_data + output_grad_offset * g, args1.wdesc.desc(),
-              filter_data + filter_offset * g, args1.cdesc.desc(), data_algo,
-              cudnn_workspace, workspace_size, &beta, args1.odesc.desc(),
-              input_grad_data + input_offset * g));
-        };
-#endif  // PADDLE_WITH_HIP
-        workspace_handle.RunFunc(cudnn_func, workspace_size);
-      }
-
-      if (data_layout == platform::DataLayout::kNHWC) {
-        Tensor input_grad_transpose;
-        Tensor input_grad_nchw;
-        input_grad_nchw.ShareDataWith(*input_grad);
-        input_grad_nchw.Resize(phi::make_ddim(input_vec));
-        if (strides.size() == 2U) {
-          std::vector<int> axis = {0, 2, 3, 1};
-          DataTranspose<T, 4>(ctx, &input_grad_nchw, &input_grad_transpose,
-                              axis);
-          *input_grad = input_grad_transpose;
-        } else if (strides.size() == 3U) {
-          std::vector<int> axis = {0, 2, 3, 4, 1};
-          DataTranspose<T, 5>(ctx, &input_grad_nchw, &input_grad_transpose,
-                              axis);
-          *input_grad = input_grad_transpose;
-        }
-      }
-    }
-
-    // ------------------- cudnn conv backward filter ---------------------
-    if (filter_grad) {
-      // Because beta is zero, it is unnecessary to reset filter_grad.
-      // Gradient with respect to the filter
-      for (int g = 0; g < groups; g++) {
-#ifdef PADDLE_WITH_HIP
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::miopenConvolutionBackwardWeights(
-                  handle, &alpha, args2.odesc.desc(),
-                  input_data + input_offset * g, args2.idesc.desc(),
-                  output_grad_data + output_grad_offset * g, args2.cdesc.desc(),
-                  filter_algo, &beta, args2.wdesc.desc(),
-                  filter_grad_data + filter_offset * g, cudnn_workspace,
-                  workspace_size));
-        };
-#else   // PADDLE_WITH_HIP
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          PADDLE_ENFORCE_GPU_SUCCESS(
-              platform::dynload::cudnnConvolutionBackwardFilter(
-                  handle, &alpha, args2.idesc.desc(),
-                  output_grad_data + output_grad_offset * g, args2.odesc.desc(),
-                  input_data + input_offset * g, args2.cdesc.desc(),
-                  filter_algo, cudnn_workspace, workspace_size, &beta,
-                  args2.wdesc.desc(), filter_grad_data + filter_offset * g));
-        };
-#endif  // PADDLE_WITH_HIP
-        workspace_handle.RunFunc(cudnn_func, workspace_size);
-      }
-    }
-  }
-};
-
-/*
- * Inputs:  I, W, dO, ddI, ddW
- * Outputs: ddO, dW, dI
- * ddo = conv_bp_data(W, ddI) + conv_bp_data(ddW, I)
- * dW = conv_bp_filter(dO, ddI)
- * dI = conv(dO, ddW)
- */
-template <typename T>
-class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
-    auto X = ctx.Input<Tensor>("Input");
-    auto W = ctx.Input<Tensor>("Filter");
-    auto dO = ctx.Input<Tensor>("DOutput");
-    auto ddX = ctx.Input<Tensor>("DDInput");
-    auto ddW = ctx.Input<Tensor>("DDFilter");
-
-    auto ddO = ctx.Output<Tensor>("DDOutput");
-    auto dW = ctx.Output<Tensor>("DFilter");
-    auto dX = ctx.Output<Tensor>("DInput");
-
-    if (ddO) {
-      ddO->mutable_data<T>(ctx.GetPlace());
-      phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
-      set_zero(dev_ctx, ddO, static_cast<T>(0));
-    }
-    if (dW) {
-      dW->mutable_data<T>(ctx.GetPlace());
-    }
-    if (dX) {
-      dX->mutable_data<T>(ctx.GetPlace());
-    }
-
-    const T* dy = dO->data<T>();
-    const T* w = W->data<T>();
-
-    const T* ddx = nullptr;
-    const T* ddw = nullptr;
-    T *dw, *dx, *ddy;
-    dw = dx = ddy = nullptr;
-    T* transformed_dx = nullptr;
-    const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-    int groups = ctx.Attr<int>("groups");
-
-    bool deterministic = FLAGS_cudnn_deterministic;
-
-    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-    // transform Tensors to channel first-----------
-    Tensor transformed_X_channel(X->type());
-    Tensor transformed_dO_channel(dO->type());
-    Tensor transformed_ddX_channel(X->type());
-
-    Tensor transformed_ddO_channel(dO->type());
-    Tensor transformed_dX_channel(X->type());
-
-    if (channel_last) {
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, X, &transformed_X_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, X, &transformed_X_channel);
-
-      ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, dO, &transformed_dO_channel);
-      TransToChannelFirst<platform::CUDADeviceContext, T>(
-          ctx, dO, &transformed_dO_channel);
-
-      if (ddX) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddX, &transformed_ddX_channel);
-        TransToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddX, &transformed_ddX_channel);
-      }
-
-      if (ddO) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, ddO, &transformed_ddO_channel);
-      }
-      if (dX) {
-        ResizeToChannelFirst<platform::CUDADeviceContext, T>(
-            ctx, dX, &transformed_dX_channel);
-        transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
-      }
-
-    } else {
-      transformed_X_channel = *X;
-      transformed_dO_channel = *dO;
-      if (ddX) {
-        transformed_ddX_channel = *ddX;
-      }
-      if (dX) {
-        transformed_dX_channel = *dX;
-      }
-    }
-    std::vector<int> output_vec =
-        phi::vectorize<int>(transformed_dO_channel.dims());
-
-    auto in_dims = transformed_X_channel.dims();
-    auto filter_dims = W->dims();
-    framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    int data_dim = strides.size();  // 2d or 3d
-    bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
-    Tensor transformed_X(X->type());
-    Tensor transformed_ddX(X->type());
-
-    Tensor transformed_dO(dO->type());
-
-    std::vector<int> padding_common(data_dim, 0);
-    std::vector<int> input_pad(X->dims().size() * 2, 0);
-
-    if (!is_sys_pad) {
-      // get pad
-      std::vector<int> padding_diff(data_dim);
-      std::vector<int> new_input_shape_vec(data_dim + 2);
-      std::vector<int> new_output_grad_shape_vec(data_dim + 2);
-
-      new_input_shape_vec[0] = transformed_X_channel.dims()[0];
-      new_input_shape_vec[1] = transformed_X_channel.dims()[1];
-
-      new_output_grad_shape_vec[0] = transformed_dO_channel.dims()[0];
-      new_output_grad_shape_vec[1] = transformed_dO_channel.dims()[1];
-
-      for (size_t i = 0; i < data_dim; ++i) {
-        padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
-        padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
-        new_input_shape_vec[i + 2] =
-            transformed_X_channel.dims()[i + 2] + padding_diff[i];
-
-        new_output_grad_shape_vec[i + 2] =
-            transformed_dO_channel.dims()[i + 2] + padding_diff[i];
-
-        input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
-        input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
-      }
-      framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
-      transformed_X.Resize(new_input_shape);
-      transformed_ddX.Resize(new_input_shape);
-
-      framework::DDim new_output_grad_shape(
-          phi::make_ddim(new_output_grad_shape_vec));
-      transformed_dO.Resize(new_output_grad_shape);
-
-      transformed_dO =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_output_grad_shape, dev_ctx);
-
-      transformed_X =
-          ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-              new_input_shape, dev_ctx);
-      if (ddX) {
-        transformed_ddX =
-            ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
-                new_input_shape, dev_ctx);
-      }
-
-      // pad for input
-      const int rank = X->dims().size();
-      T pad_value(0.0);
-      switch (rank) {
-        case 4: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-              dev_ctx, input_pad, transformed_X_channel, pad_value,
-              &transformed_X);
-          if (dO) {
-            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-                dev_ctx, input_pad, transformed_dO_channel, pad_value,
-                &transformed_dO);
-          }
-
-          if (ddX) {
-            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
-                dev_ctx, input_pad, transformed_ddX_channel, pad_value,
-                &transformed_ddX);
-          }
-        } break;
-        case 5: {
-          phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-              dev_ctx, input_pad, transformed_X_channel, pad_value,
-              &transformed_X);
-          if (ddX) {
-            phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
-                dev_ctx, input_pad, transformed_ddX_channel, pad_value,
-                &transformed_ddX);
-          }
-        } break;
-        default:
-          PADDLE_THROW(platform::errors::InvalidArgument(
-              "ConvOp only support tensors with 4 or 5 dimensions."));
-      }
-
-    } else {
-      transformed_X = transformed_X_channel;
-      transformed_dO = transformed_dO_channel;
-      if (ddX) {
-        transformed_ddX = transformed_ddX_channel;
-      }
-
-      if (paddings.size() == data_dim) {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[i];
-        }
-      } else {
-        for (size_t i = 0; i < data_dim; ++i) {
-          padding_common[i] = paddings[2 * i];
-        }
-      }
-    }
-
-    std::vector<int64_t> starts(data_dim, 0);
-    std::vector<int64_t> ends(data_dim, 0);
-    std::vector<int64_t> axes(data_dim, 0);
-    for (size_t i = 0; i < data_dim; ++i) {
-      starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
-      ends[i] = starts[i] + output_vec[i + 2];
-      axes[i] = i + 2;
-    }
-
-    std::vector<int> transformed_output_vec = output_vec;
-    for (size_t i = 0; i < data_dim; ++i) {
-      transformed_output_vec[i + 2] =
-          output_vec[i + 2] +
-          (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
-          2 * padding_common[i] + paddings[2 * i] + paddings[2 * i + 1];
-    }
-
-    if (!is_sys_pad) {
-      DDim transformed_output_shape(phi::make_ddim(transformed_output_vec));
-      transformed_ddO_channel.mutable_data<T>(transformed_output_shape,
-                                              ctx.GetPlace());
-    } else {
-      ddO->mutable_data<T>(ctx.GetPlace());
-      transformed_ddO_channel = *ddO;
-      transformed_ddO_channel.Resize(phi::make_ddim(transformed_output_vec));
-    }
-
-    const T* x = transformed_X.data<T>();
-
-    int iwo_group = groups;
-    int c_group = 1;
-#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
-    iwo_group = 1;
-    c_group = groups;
-    groups = 1;
-#endif
-    auto dtype = platform::CudnnDataType<T>::type;
-
-    auto handle = dev_ctx.cudnn_handle();
-
-    ConvArgs args1{&transformed_ddO_channel,
-                   W,
-                   &transformed_ddX,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args2{&transformed_ddO_channel, ddW,       &transformed_X, strides,
-                   padding_common,           dilations, dtype};
-
-    ConvArgs args3{&transformed_dO,
-                   dW,
-                   &transformed_ddX_channel,
-                   strides,
-                   padding_common,
-                   dilations,
-                   dtype};
-    ConvArgs args4{
-        &transformed_dO, ddW,  &transformed_dX_channel, strides, padding_common,
-        dilations,       dtype};
-#ifdef PADDLE_WITH_HIP
-    miopenConvBwdDataAlgorithm_t bwd_algo1 =
-        static_cast<miopenConvBwdDataAlgorithm_t>(0);
-    miopenConvBwdDataAlgorithm_t bwd_algo2 =
-        static_cast<miopenConvBwdDataAlgorithm_t>(0);
-    miopenConvFwdAlgorithm_t data_algo =
-        static_cast<miopenConvFwdAlgorithm_t>(0);
-    miopenConvBwdWeightsAlgorithm_t filter_algo =
-        static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
-#else
-    cudnnConvolutionBwdDataAlgo_t bwd_algo1 =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionBwdDataAlgo_t bwd_algo2 =
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
-    cudnnConvolutionFwdAlgo_t data_algo =
-        static_cast<cudnnConvolutionFwdAlgo_t>(0);
-    cudnnConvolutionBwdFilterAlgo_t filter_algo =
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
-#endif
-
-    auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW);
-
-    // ddo = conv(ddI, W) + conv(I, ddW)
-    size_t workspace_size = 0;
-
-    T* transformed_ddy_channel = nullptr;
-
-    if (ddO) {
-      ddy = ddO->data<T>();
-      transformed_ddy_channel = transformed_ddO_channel.data<T>();
-      if (ddX) {
-        args1.handle = handle;
-        args1.idesc.set(transformed_ddO_channel, iwo_group);
-        args1.wdesc.set(*W, layout, iwo_group);
-        args1.odesc.set(transformed_ddX, iwo_group);
-        args1.cdesc.set(dtype, padding_common, strides, dilations,
-                        platform::AllowTF32Cudnn(), c_group);
-#ifdef PADDLE_WITH_HIP
-        using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-        workspace_size = search1::GetWorkspaceSize(args1);
-        bwd_algo1 = search1::Find<T>(
-            args1, false, deterministic, workspace_size,
-            ctx.template device_context<platform::CUDADeviceContext>());
-#else
-        using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-        bwd_algo1 = search1::Find<T>(
-            args1, false, deterministic,
-            ctx.template device_context<platform::CUDADeviceContext>());
-        workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
-#endif
-      }
-
-      if (ddW) {
-        ddw = ddW->data<T>();
-        args2.handle = handle;
-        args2.idesc.set(transformed_ddO_channel, iwo_group);
-        args2.wdesc.set(*ddW, layout, iwo_group);
-        args2.odesc.set(transformed_X, iwo_group);
-        args2.cdesc.set(dtype, padding_common, strides, dilations,
-                        platform::AllowTF32Cudnn(), c_group);
-#ifdef PADDLE_WITH_HIP
-        using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
-        workspace_size =
-            std::max(workspace_size, search2::GetWorkspaceSize(args2));
-        bwd_algo2 = search2::Find<T>(
-            args2, false, deterministic, workspace_size,
-            ctx.template device_context<platform::CUDADeviceContext>());
-#else
-        using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-        bwd_algo2 = search2::Find<T>(
-            args2, false, deterministic,
-            ctx.template device_context<platform::CUDADeviceContext>());
-        workspace_size = std::max(workspace_size,
-                                  search2::GetWorkspaceSize(args2, bwd_algo2));
-#endif
-      }
-    }
-
-    if (dW && ddX) {
-      dw = dW->data<T>();
-      args3.handle = handle;
-      args3.idesc.set(transformed_dO, iwo_group);
-      args3.wdesc.set(*dW, layout, iwo_group);
-
-      args3.odesc.set(transformed_ddX_channel, iwo_group);
-
-      args3.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_group);
-#ifdef PADDLE_WITH_HIP
-      using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search3::GetWorkspaceSize(args3));
-      filter_algo = search3::Find<T>(
-          args3, false, deterministic, workspace_size,
-          ctx.template device_context<platform::CUDADeviceContext>());
-#else
-      using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-      filter_algo = search3::Find<T>(
-          args3, false, deterministic,
-          ctx.template device_context<platform::CUDADeviceContext>());
-      workspace_size = std::max(workspace_size,
-                                search3::GetWorkspaceSize(args3, filter_algo));
-#endif
-    }
-
-    if (ddW && dX) {
-      transformed_dx = transformed_dX_channel.data<T>();
-
-      args4.handle = handle;
-      args4.idesc.set(transformed_dO, iwo_group);
-      args4.wdesc.set(*ddW, layout, iwo_group);
-      args4.odesc.set(transformed_dX_channel, iwo_group);
-      args4.cdesc.set(dtype, padding_common, strides, dilations,
-                      platform::AllowTF32Cudnn(), c_group);
-#ifdef PADDLE_WITH_HIP
-      using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
-      workspace_size =
-          std::max(workspace_size, search4::GetWorkspaceSize(args4));
-      data_algo = search4::Find<T>(
-          args4, false, deterministic, workspace_size,
-          ctx.template device_context<platform::CUDADeviceContext>());
-#else
-      using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-      data_algo = search4::Find<T>(
-          args4, false, deterministic,
-          ctx.template device_context<platform::CUDADeviceContext>());
-      workspace_size =
-          std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
-#endif
-    }
-
-    int i_n, i_c, i_d, i_h, i_w;
-    GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c,
-             &i_d, &i_h, &i_w);
-
-    int o_n, o_c, o_d, o_h, o_w;
-    GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c,
-             &o_d, &o_h, &o_w);
-
-    int group_offset_in =
-        transformed_X.numel() / transformed_X.dims()[0] / groups;
-    int group_offset_out =
-        transformed_dO.numel() / transformed_dO.dims()[0] / groups;
-    int group_offset_filter = W->numel() / groups;
-
-    ScalingParamType<T> alpha = 1.0f;
-    ScalingParamType<T> beta = 0.0f;
-
-    auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
-
-    if (ddO) {
-      if (ddX) {
-        ddx = transformed_ddX.data<T>();
-        for (int i = 0; i < groups; i++) {
-#ifdef PADDLE_WITH_HIP
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::miopenConvolutionBackwardData(
-                        handle, &alpha, args1.odesc.desc(),
-                        ddx + i * group_offset_in, args1.wdesc.desc(),
-                        w + i * group_offset_filter, args1.cdesc.desc(),
-                        bwd_algo1, &beta, args1.idesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out,
-                        workspace_ptr, workspace_size));
-              },
-              workspace_size);
-#else   // PADDLE_WITH_HIP
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::cudnnConvolutionBackwardData(
-                        handle, &alpha, args1.wdesc.desc(),
-                        w + i * group_offset_filter, args1.odesc.desc(),
-                        ddx + i * group_offset_in, args1.cdesc.desc(),
-                        bwd_algo1, workspace_ptr, workspace_size, &beta,
-                        args1.idesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out));
-              },
-              workspace_size);
-#endif  // PADDLE_WITH_HIP
-        }
-      }
-      if (ddW) {
-        for (int i = 0; i < groups; i++) {
-#ifdef PADDLE_WITH_HIP
-          // MIOPEN ONLY support beta to be 0.0f
-          Tensor conv_x_ddw(dO->type());
-          conv_x_ddw.Resize(transformed_ddO_channel.dims());
-          T* conv_x_ddw_data = conv_x_ddw.mutable_data<T>(ctx.GetPlace());
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::miopenConvolutionBackwardData(
-                        handle, &alpha, args2.odesc.desc(),
-                        x + i * group_offset_in, args2.wdesc.desc(),
-                        ddw + i * group_offset_filter, args2.cdesc.desc(),
-                        bwd_algo2, &beta, args2.idesc.desc(),
-                        conv_x_ddw_data + i * group_offset_out, workspace_ptr,
-                        workspace_size));
-              },
-              workspace_size);
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
-              handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(),
-              transformed_ddy_channel + i * group_offset_out, &alpha,
-              args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta,
-              args2.idesc.desc(),
-              transformed_ddy_channel + i * group_offset_out));
-#else   // PADDLE_WITH_HIP
-          wkspace_handle.RunFunc(
-              [&](void* workspace_ptr) {
-                PADDLE_ENFORCE_GPU_SUCCESS(
-                    platform::dynload::cudnnConvolutionBackwardData(
-                        handle, &alpha, args2.wdesc.desc(),
-                        ddw + i * group_offset_filter, args2.odesc.desc(),
-                        x + i * group_offset_in, args2.cdesc.desc(), bwd_algo2,
-                        workspace_ptr, workspace_size, &alpha,
-                        args2.idesc.desc(),
-                        transformed_ddy_channel + i * group_offset_out));
-              },
-              workspace_size);
-#endif  // PADDLE_WITH_HIP
-        }
-      }
-      if ((!is_sys_pad) && (!channel_last)) {
-        if (strides.size() == 2U) {
-          Slice<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, &transformed_ddO_channel, ddO, starts, ends, axes);
-        } else if (!is_sys_pad && strides.size() == 3U) {
-          Slice<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, &transformed_ddO_channel, ddO, starts, ends, axes);
-        }
-      } else if ((!is_sys_pad) && (channel_last)) {
-        if (strides.size() == 2U) {
-          Slice<paddle::platform::CUDADeviceContext, T, 4>(
-              ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts,
-              ends, axes);
-        } else if (!is_sys_pad && strides.size() == 3U) {
-          Slice<paddle::platform::CUDADeviceContext, T, 5>(
-              ctx, &transformed_ddO_channel, &transformed_ddO_channel, starts,
-              ends, axes);
-        }
-
-        TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_ddO_channel, ddO);
-      }
-    }
-
-    T* transformed_dy_channel = transformed_dO.data<T>();
-    if (dW && ddX) {
-      ddx = transformed_ddX_channel.data<T>();
-      for (int i = 0; i < groups; i++) {
-#ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionBackwardWeights(
-                      handle, &alpha, args3.odesc.desc(),
-                      ddx + i * group_offset_in, args3.idesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args3.cdesc.desc(), filter_algo, &beta,
-                      args3.wdesc.desc(), dw + i * group_offset_filter,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
-#else   // PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionBackwardFilter(
-                      handle, &alpha, args3.idesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args3.odesc.desc(), ddx + i * group_offset_in,
-                      args3.cdesc.desc(), filter_algo, workspace_ptr,
-                      workspace_size, &beta, args3.wdesc.desc(),
-                      dw + i * group_offset_filter));
-            },
-            workspace_size);
-#endif  // PADDLE_WITH_HIP
-      }
-    }
-
-    if (dX && ddW) {
-      ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
-#ifdef PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args4.idesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args4.wdesc.desc(), ddw + i * group_offset_filter,
-                      args4.cdesc.desc(), data_algo, &beta, args4.odesc.desc(),
-                      transformed_dx + i * group_offset_in, workspace_ptr,
-                      workspace_size));
-            },
-            workspace_size);
-#else   // PADDLE_WITH_HIP
-        wkspace_handle.RunFunc(
-            [&](void* workspace_ptr) {
-              PADDLE_ENFORCE_GPU_SUCCESS(
-                  platform::dynload::cudnnConvolutionForward(
-                      handle, &alpha, args4.idesc.desc(),
-                      transformed_dy_channel + i * group_offset_out,
-                      args4.wdesc.desc(), ddw + i * group_offset_filter,
-                      args4.cdesc.desc(), data_algo, workspace_ptr,
-                      workspace_size, &beta, args4.odesc.desc(),
-                      transformed_dx + i * group_offset_in));
-            },
-            workspace_size);
-#endif  // PADDLE_WITH_HIP
-      }
-      if (channel_last) {
-        TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
-            ctx, &transformed_dX_channel, dX);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeOpKernel<float>);
-REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeGradOpKernel<float>);
-REGISTER_OP_KERNEL(
-    conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeOpKernel<float>);
-REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeGradOpKernel<float>);
-#else
-REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeOpKernel<float>,
-                   ops::CUDNNConvTransposeOpKernel<double>);
-REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeGradOpKernel<float>,
-                   ops::CUDNNConvTransposeGradOpKernel<double>);
-REGISTER_OP_KERNEL(
-    conv2d_transpose_grad_grad, CUDNN, plat::CUDAPlace,
-    paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<float>,
-    paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<double>,
-    paddle::operators::CUDNNConvTransposeDoubleGradOpKernel<plat::float16>);
-
-REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeOpKernel<float>,
-                   ops::CUDNNConvTransposeOpKernel<double>);
-REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
-                   ops::CUDNNConvTransposeGradOpKernel<plat::float16>,
-                   ops::CUDNNConvTransposeGradOpKernel<float>,
-                   ops::CUDNNConvTransposeGradOpKernel<double>);
-#endif
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 86532664985b4..fe76fc3aebbc1 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_transpose_op.h"
-#include <memory>
+
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/cudnn_workspace_helper.h"
-
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -29,165 +33,6 @@ namespace operators {
 
 using DataLayout = framework::DataLayout;
 
-void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
-  OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ConvTranspose");
-  OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "ConvTranspose");
-  OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ConvTranspose");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  std::vector<int> output_size =
-      ctx->Attrs().Get<std::vector<int>>("output_size");
-  std::vector<int> output_padding =
-      ctx->Attrs().Get<std::vector<int>>("output_padding");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
-  int groups = ctx->Attrs().Get<int>("groups");
-  std::string padding_algorithm =
-      ctx->Attrs().Get<std::string>("padding_algorithm");
-  const std::string data_layout_str =
-      ctx->Attrs().Get<std::string>("data_format");
-  const DataLayout data_layout =
-      ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW
-                               : framework::StringToDataLayout(data_layout_str);
-
-  PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true,
-                    platform::errors::InvalidArgument(
-                        "Input of Op(conv_transpose) should be 4-D or "
-                        "5-D Tensor. But received: %u-D Tensor, "
-                        "the shape of input is [%s]",
-                        in_dims.size(), in_dims));
-  PADDLE_ENFORCE_EQ(
-      in_dims.size(), filter_dims.size(),
-      platform::errors::InvalidArgument(
-          "The input's dimension size and filter's dimension size of "
-          "Op (conv_transpose) should be equal. But received: the shape of "
-          "input is [%s], the dimension size of input is [%d], the shape "
-          "of filter is [%s],  the dimension size of filter is [%d]. ",
-          in_dims, in_dims.size(), filter_dims, filter_dims.size()));
-
-  int stride_size = strides.size();
-  for (int i = 0; i < stride_size; ++i) {
-    PADDLE_ENFORCE_GT(
-        strides[i], 0,
-        platform::errors::InvalidArgument(
-            "The stride of Op(Conv) should be larget than 0, but received "
-            "stride is %d.",
-            strides[i]));
-  }
-
-  int in_sub_stride_size = in_dims.size() - stride_size;
-
-  PADDLE_ENFORCE_EQ(
-      in_dims.size() - strides.size(), 2U,
-      platform::errors::InvalidArgument(
-          "The input's dimension size minus Attr(stride)'s size must "
-          "be euqal to 2 for Op(conv_transpose). But received: [%d], the "
-          "input's dimension size is [%d], the shape of input "
-          "is [%s], the Attr(stride)'s size is [%d].",
-          in_sub_stride_size, in_dims.size(), in_dims, strides.size()));
-  if (output_size.size())
-    PADDLE_ENFORCE_EQ(
-        output_size.size(), strides.size(),
-        platform::errors::InvalidArgument(
-            "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
-            "should be the same."));
-  if (output_padding.size())
-    PADDLE_ENFORCE_EQ(
-        output_padding.size(), strides.size(),
-        platform::errors::InvalidArgument(
-            "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
-            "should be the same."));
-
-  const int64_t C =
-      (data_layout != DataLayout::kNHWC ? in_dims[1]
-                                        : in_dims[in_dims.size() - 1]);
-  PADDLE_ENFORCE_EQ(
-      C, filter_dims[0],
-      platform::errors::InvalidArgument(
-          "The number of input channels should be equal to filter channels "
-          "for Op(conv_transpose). But received: the input's channels is "
-          "[%d], the shape of input is [%s], the filter's channels is [%d], "
-          "the shape of filter is [%s]. The data_format is %s."
-          "The error may come from wrong data_format setting.",
-          C, in_dims, filter_dims[0], filter_dims, data_layout_str));
-
-  framework::DDim in_data_dims;
-  if (data_layout != DataLayout::kNHWC) {
-    in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-  } else {
-    in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-  }
-  framework::DDim filter_data_dims =
-      phi::slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-  UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                           in_data_dims, strides, ksize);
-
-  std::vector<int64_t> output_shape({in_dims[0]});
-  if (data_layout != DataLayout::kNHWC) {
-    output_shape.push_back(filter_dims[1] * groups);
-  }
-  const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1);
-  for (size_t i = 0; i < strides.size(); ++i) {
-    auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
-    auto infer_shape = (ctx->IsRuntime() || in_dims[i + offset] > 0)
-                           ? (in_dims[i + offset] - 1) * strides[i] -
-                                 paddings[2 * i] - paddings[2 * i + 1] +
-                                 filter_extent
-                           : -1;
-    if (output_size.size()) {
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_GE(
-            output_size[i], infer_shape,
-            platform::errors::InvalidArgument(
-                "output_size of Op(ConvTransposeOp) should not be "
-                "less than the infered output size. But received output_size = "
-                "[%s], whose dim %d is less than the infered output size [%s]",
-                phi::make_ddim(output_size).to_str(), i, infer_shape));
-        PADDLE_ENFORCE_LT(
-            output_size[i], infer_shape + strides[i],
-            platform::errors::InvalidArgument(
-                "output_size of Op(ConvTransposeOp) should be less "
-                "than infered size + stride. But received output_size = [%s], "
-                "whose dim %d is not less than the infered output size (%d) + "
-                "stride (%d) = %d",
-                phi::make_ddim(output_size).to_str(), i, infer_shape,
-                strides[i], infer_shape + strides[i]));
-      }
-      output_shape.push_back(output_size[i]);
-    } else if (output_padding.size()) {
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_GE(
-            output_padding[i], 0,
-            platform::errors::InvalidArgument(
-                "output_padding of Op(ConvTransposeOp) should not be "
-                "less than the 0. But received output_padding = "
-                "[%s], whose dim %d is less than 0",
-                phi::make_ddim(output_padding).to_str(), i));
-        PADDLE_ENFORCE_LT(
-            output_padding[i], std::max(strides[i], dilations[i]),
-            platform::errors::InvalidArgument(
-                "output_padding of Op(ConvTransposeOp) should be less "
-                "than either stride or dilation. But received output_size = "
-                "[%s], "
-                "whose dim %d is not less than either stride (%d)  or "
-                "dilation (%d)",
-                phi::make_ddim(output_size).to_str(), i, strides[i],
-                dilations[i]));
-      }
-      output_shape.push_back((infer_shape + output_padding[i]));
-    } else {
-      output_shape.push_back(infer_shape);
-    }
-  }
-  if (data_layout == DataLayout::kNHWC) {
-    output_shape.push_back(filter_dims[1] * groups);
-  }
-  ctx->SetOutputDim("Output", phi::make_ddim(output_shape));
-}
-
 framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
@@ -217,7 +62,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
 }
 
 framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar(
-    const std::string& var_name, const Tensor& tensor,
+    const std::string& var_name, const framework::Tensor& tensor,
     const framework::OpKernelType& expected_kernel_type) const {
 #ifdef PADDLE_WITH_MKLDNN
   // Only input require reshaping, weights and
@@ -493,17 +338,6 @@ The input(X) size and output(Out) size may be different.
 )DOC");
 }
 
-void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
 framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   bool use_cudnn =
@@ -587,24 +421,6 @@ class ConvTransposeDoubleGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-void ConvTransposeOpDoubleGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
-  auto x_dims = ctx->GetInputDim("Input");
-  auto w_dims = ctx->GetInputDim("Filter");
-  auto do_dims = ctx->GetInputDim("DOutput");
-
-  if (ctx->HasOutput("DDOutput") &&
-      (ctx->HasInput("DDInput") || (ctx->HasInput("DDFilter")))) {
-    ctx->SetOutputDim("DDOutput", do_dims);
-  }
-  if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) {
-    ctx->SetOutputDim("DFilter", w_dims);
-  }
-  if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) {
-    ctx->SetOutputDim("DInput", x_dims);
-  }
-}
-
 framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   bool use_cudnn =
@@ -635,59 +451,57 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 
 // conv2d_transpose
+DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose, Conv2dTranposeInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(conv2d_transpose_grad,
+                            Conv2dTranposeGradInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeGradInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(
+    conv2d_transpose_grad_grad, Conv2dTranposeDoubleGradInferShapeFunctor,
+    PD_INFER_META(phi::Conv2dTransposeDoubleGradInferMeta));
+
 REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
                   ops::Conv2DTransposeOpMaker,
                   ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(
-    conv2d_transpose_grad, ops::ConvTransposeOpGrad,
-    ops::ConvTransposeDoubleGradMaker<paddle::framework::OpDesc>,
-    ops::ConvTransposeDoubleGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
+                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
+                  Conv2dTranposeInferShapeFunctor);
+REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad,
+                  ops::ConvTransposeDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::ConvTransposeDoubleGradMaker<paddle::imperative::OpBase>,
+                  Conv2dTranposeGradInferShapeFunctor);
+REGISTER_OPERATOR(conv2d_transpose_grad_grad, ops::ConvTransposeOpDoubleGrad,
+                  Conv2dTranposeDoubleGradInferShapeFunctor);
 
 // conv3d_transpose
+DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose, Conv3dTranposeInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(conv3d_transpose_grad,
+                            Conv3dTranposeGradInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeGradInferMeta));
+
 REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
                   ops::Conv3DTransposeOpMaker,
                   ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
+                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
+                  Conv3dTranposeInferShapeFunctor);
+REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad,
+                  Conv3dTranposeGradInferShapeFunctor);
 
 // depthwise conv2d_transpose
+DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose,
+                            DepthWiseConv2dTranposeInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeInferMeta));
+DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d_transpose_grad,
+                            DepthWiseConv2dTranposeGradInferShapeFunctor,
+                            PD_INFER_META(phi::ConvTransposeGradInferMeta));
+
 REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
                   ops::Conv2DTransposeOpMaker,
                   ops::ConvTransposeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    depthwise_conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
+                  ops::ConvTransposeGradOpMaker<paddle::imperative::OpBase>,
+                  DepthWiseConv2dTranposeInferShapeFunctor);
+REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad,
+                  DepthWiseConv2dTranposeGradInferShapeFunctor);
 
 REGISTER_OP_VERSION(conv_transpose)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu
deleted file mode 100644
index 054cb4b33895b..0000000000000
--- a/paddle/fluid/operators/conv_transpose_op.cu
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/conv_transpose_op.h"
-#include "paddle/phi/kernels/gpu/depthwise_conv.h"
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    int groups = context.Attr<int>("groups");
-    PADDLE_ENFORCE_EQ(
-        groups, filter.dims()[0],
-        platform::errors::InvalidArgument(
-            "groups should be error to the 1st dimension of filter. But "
-            "received groups is %d and filter dimension[0] is %d",
-            groups, filter.dims()[0]));
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-    for (auto v : dilations) {
-      PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
-                                  "dilations should be 1 in depthwise conv. "
-                                  "But received dilations is %d",
-                                  v));
-    }
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, output, static_cast<T>(0));
-
-    math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
-        depthwiseConvInputGrad;
-    depthwiseConvInputGrad(
-        static_cast<const typename framework::ConvertToPhiContext<
-            DeviceContext>::TYPE&>(dev_ctx),
-        *output, filter, *input, strides,
-        std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-        dilations, output, data_layout);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    if (!input_grad && !filter_grad) return;
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    if (input_grad) {
-      math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
-      depthwiseConv(
-          static_cast<const typename framework::ConvertToPhiContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          *output_grad, filter, strides,
-          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-          dilations, input_grad, data_layout);
-    }
-
-    if (filter_grad) {
-      phi::funcs::SetConstant<DeviceContext, T> set_zero;
-      filter_grad->mutable_data<T>(context.GetPlace());
-      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-
-      math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
-          depthwiseConvFilterGrad;
-      depthwiseConvFilterGrad(
-          static_cast<const typename framework::ConvertToPhiContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          *output_grad, *input, strides,
-          std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
-          dilations, filter_grad, data_layout);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-// conv2d
-REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
-                        ops::GemmConvTransposeKernel<CUDA, float>,
-                        ops::GemmConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
-                        ops::GemmConvTransposeGradKernel<CUDA, float>,
-                        ops::GemmConvTransposeGradKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad_grad,
-                        ops::GemmConvTransposeGradKernel<CUDA, float>,
-                        ops::GemmConvTransposeGradKernel<CUDA, double>);
-
-// conv3d
-REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
-                        ops::GemmConvTransposeKernel<CUDA, float>,
-                        ops::GemmConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
-                        ops::GemmConvTransposeGradKernel<CUDA, float>,
-                        ops::GemmConvTransposeGradKernel<CUDA, double>);
-
-// depthwise conv2d
-REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
-                        ops::DepthwiseConvTransposeKernel<CUDA, float>,
-                        ops::DepthwiseConvTransposeKernel<CUDA, double>);
-REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
-                        ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
-                        ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index ee0fb7ab36833..ac95dceb8280c 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -13,72 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-template <typename DeviceContext, typename T, size_t D>
-static void Slice(const framework::ExecutionContext& context,
-                  const Tensor* input, Tensor* out,
-                  const std::vector<int64_t>& begin_vec,
-                  const std::vector<int64_t>& end_vec,
-                  const std::vector<int64_t>& axes_vec) {
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  auto in_dims = input->dims();
-  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
-  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
-  for (size_t i = 0; i < D; ++i) {
-    offsets[i] = 0;
-    extents[i] = in_dims[i];
-  }
-
-  std::vector<int64_t> out_shape_vec = phi::vectorize(in_dims);
-  for (size_t i = 0; i < axes_vec.size(); ++i) {
-    offsets[axes_vec[i]] = begin_vec[i];
-    extents[axes_vec[i]] = end_vec[i] - begin_vec[i];
-    out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i];
-  }
-
-  framework::DDim out_dims(phi::make_ddim(out_shape_vec));
-  out->mutable_data<T>(out_dims, context.GetPlace());
-
-  auto in_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *input);
-  auto out_t =
-      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
-          *out, out_dims);
-
-  EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(place, out_t, in_t,
-                                                        offsets, extents);
-  out->Resize(out_dims);
-}
-
-template <typename DeviceContext, typename T, size_t D>
-static void Slice(const framework::ExecutionContext& context,
-                  const Tensor* input, Tensor* out, int64_t begin_idx,
-                  int64_t end_idx, int64_t axes) {
-  std::vector<int64_t> begin_vec = {begin_idx};
-  std::vector<int64_t> end_vec = {end_idx};
-  std::vector<int64_t> axes_vec = {axes};
-  Slice<DeviceContext, T, D>(context, input, out, begin_vec, end_vec, axes_vec);
-}
-
 // Define Op classes in .h file so that other conv transpose
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -94,21 +36,19 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 class ConvTransposeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override;
 
   framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
+      const std::string& var_name, const framework::Tensor& tensor,
       const framework::OpKernelType& expected_kernel_type) const override;
 };
 
 class ConvTransposeOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -118,464 +58,11 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel {
 class ConvTransposeOpDoubleGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override;
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override;
 };
 
-template <typename DeviceContext, typename T>
-class GemmConvTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped, so it should not be constant pointer
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    int groups = context.Attr<int>("groups");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-    auto out_dims = output->dims();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
-    // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
-    std::vector<int64_t> input_shape_vec = phi::vectorize(input->dims());
-    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec = phi::vectorize(filter.dims());
-
-    // use col_shape in the im2col and col2im (or vol2col and col2vol)
-    // calculation
-    // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w}
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    if (data_layout != framework::DataLayout::kNHWC) {
-      col_shape_vec[0] = out_dims[1] / groups;
-      for (size_t j = 0; j < data_dim; ++j) {
-        col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-        col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
-      }
-    } else {
-      col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups;
-      for (size_t j = 0; j < data_dim; ++j) {
-        col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-        col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1];
-      }
-    }
-    DDim col_shape(phi::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-
-    // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
-    // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
-    DDim output_shape =
-        phi::slice_ddim(output->dims(), 1, output->dims().size());
-
-    // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
-    // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
-    DDim input_matrix_shape;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      input_matrix_shape = {in_dims[1], col_matrix_shape[1]};
-    } else {
-      input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]};
-    }
-
-    // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
-    DDim filter_matrix_shape;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      filter_matrix_shape = {in_dims[1], col_matrix_shape[0]};
-    } else {
-      filter_matrix_shape = {in_dims[in_dims.size() - 1], col_matrix_shape[0]};
-    }
-    filter.Resize(filter_matrix_shape);
-
-    output->mutable_data<T>(context.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    set_zero(dev_ctx, output, static_cast<T>(0));
-
-    int in_step =
-        (data_layout != framework::DataLayout::kNHWC
-             ? static_cast<int>(in_dims[1]) / groups
-             : static_cast<int>(in_dims[in_dims.size() - 1]) / groups);
-
-    int out_step =
-        (data_layout != framework::DataLayout::kNHWC
-             ? static_cast<int>(out_dims[1]) / groups
-             : static_cast<int>(out_dims[out_dims.size() - 1]) / groups);
-    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
-    math::Col2VolFunctor<DeviceContext, T> col2vol;
-    math::ConcatFunctor<DeviceContext, T> concat_functor;
-
-    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
-    // on input)
-    size_t D = input->dims().size();
-    for (int i = 0; i < batch_size; i++) {
-      // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first
-      // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last
-      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-
-      // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
-      // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
-      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
-
-      std::vector<Tensor> output_batch_vec;
-      for (int g = 0; g < groups; g++) {
-        int64_t start = g * in_step;
-        int64_t end = (g + 1) * in_step;
-        int axes = (data_layout != framework::DataLayout::kNHWC ? 0 : 1);
-        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
-        Tensor in_slice, out_slice;
-
-        // col_matrix = filter_slice * input_slice
-        // of shape (o_c/g * k_h * k_w, h * w)
-        // or (o_c/g * k_d * k_h * k_w, d * h * w)
-        if (data_layout != framework::DataLayout::kNHWC) {
-          in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
-          out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
-                      &col_matrix, static_cast<T>(0.0));
-        } else {
-          Slice<DeviceContext, T, 2>(context, &input_batch, &in_slice, start,
-                                     end, axes);
-          start = g * out_step;
-          end = (g + 1) * out_step;
-          axes = D - 2;
-          if (D == 4U) {
-            Slice<DeviceContext, T, 3>(context, &output_batch, &out_slice,
-                                       start, end, axes);
-          } else if (D == 5U) {
-            Slice<DeviceContext, T, 4>(context, &output_batch, &out_slice,
-                                       start, end, axes);
-          }
-          blas.MatMul(filter_slice, true, in_slice, true, static_cast<T>(1.0),
-                      &col_matrix, static_cast<T>(0.0));
-        }
-
-        if (data_dim == 2U) {
-          // col2im: col_matrix -> dy
-          // from (o_c/g * k_h * k_w, h * w) to (o_c/g, o_h, o_w) or (o_h, o_w,
-          // o_c/g)
-          col2im(dev_ctx, col, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                  paddings[3]},
-                 &out_slice, data_layout);
-        } else if (data_dim == 3U) {
-          // col2vol: col_matrix -> dy
-          // from (o_c/g * k_d * k_h * k_w, d * h * w) to (o_c/g, o_d, o_h, o_w)
-          // or (o_d, o_h, o_w, o_c/g)
-          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice,
-                  data_layout);
-        }
-        if (data_layout == framework::DataLayout::kNHWC) {
-          output_batch_vec.push_back(out_slice);
-        }
-      }
-      if (data_layout == framework::DataLayout::kNHWC) {
-        concat_functor(dev_ctx, output_batch_vec, static_cast<int>(D - 2),
-                       &output_batch);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const std::string data_layout_str =
-        context.Attr<std::string>("data_format");
-    const framework::DataLayout data_layout =
-        framework::StringToDataLayout(data_layout_str);
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    // For filter, we do not use const pointer b/c we will do reshape,
-    // but we should avoid modifying its value.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    if ((!input_grad) && (!filter_grad)) return;
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    int groups = context.Attr<int>("groups");
-    std::string padding_algorithm =
-        context.Attr<std::string>("padding_algorithm");
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter.dims();
-    auto out_grad_dims = output_grad->dims();
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    framework::DDim in_data_dims;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    }
-    framework::DDim filter_data_dims =
-        phi::slice_ddim(filter_dims, 2, filter_dims.size());
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
-
-    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
-    // input_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
-    std::vector<int64_t> input_shape_vec = phi::vectorize(input->dims());
-    // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w}
-    std::vector<int64_t> filter_shape_vec = phi::vectorize(filter.dims());
-
-    // use col_shape in the im2col and col2im (or vol2col and col2vol)
-    // calculation
-    // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    if (data_layout != framework::DataLayout::kNHWC) {
-      col_shape_vec[0] = out_grad_dims[1];
-      for (size_t j = 0; j < data_dim; ++j) {
-        col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-        col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
-      }
-    } else {
-      col_shape_vec[0] = out_grad_dims[out_grad_dims.size() - 1];
-      for (size_t j = 0; j < data_dim; ++j) {
-        col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-        col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 1];
-      }
-    }
-    DDim col_shape(phi::make_ddim(col_shape_vec));
-
-    // use col_matrix_shape in the gemm calculation
-    // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w)
-    DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
-
-    // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
-    // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
-    DDim output_shape =
-        phi::slice_ddim(output_grad->dims(), 1, output_grad->dims().size());
-
-    // input matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
-    // input matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
-    DDim input_matrix_shape;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      input_matrix_shape = {in_dims[1], col_matrix_shape[1]};
-    } else {
-      input_matrix_shape = {col_matrix_shape[1], in_dims[in_dims.size() - 1]};
-    }
-
-    // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
-    DDim filter_matrix_shape;
-    if (data_layout != framework::DataLayout::kNHWC) {
-      filter_matrix_shape = {in_dims[1], col_matrix_shape[0] / groups};
-    } else {
-      filter_matrix_shape = {in_dims[in_dims.size() - 1],
-                             col_matrix_shape[0] / groups};
-    }
-    filter.Resize(filter_matrix_shape);
-
-    int in_step =
-        (data_layout != framework::DataLayout::kNHWC
-             ? static_cast<int>(in_dims[1]) / groups
-             : static_cast<int>(in_dims[in_dims.size() - 1]) / groups);
-    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
-
-    // convolution transpose grad on input:
-    // im2col + gemm (similar to conv-forward)
-    // input need to compute gradient
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    if (input_grad || filter_grad) {
-      Tensor col;
-      col.mutable_data<T>(col_shape, context.GetPlace());
-      // col_matrix shares the same piece of data with col,
-      // but will be reshaped into a two-dimensional matrix shape
-      // to call the matrix multiplication interface.
-      Tensor col_matrix;
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-
-      Tensor filter_grad_;
-      phi::funcs::SetConstant<DeviceContext, T> set_zero;
-
-      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      math::ConcatFunctor<DeviceContext, T> concat_functor;
-
-      if (input_grad) {
-        input_grad->mutable_data<T>(context.GetPlace());
-        set_zero(dev_ctx, input_grad, static_cast<T>(0));
-      }
-      if (filter_grad) {  // filter_grad_ size (i_c, o_c/g, k_h, k_w)
-        filter_grad->mutable_data<T>(context.GetPlace());
-        set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-        filter_grad_ = *filter_grad;
-        filter_grad_.Resize(filter_matrix_shape);
-      }
-
-      size_t D = input->dims().size();
-      for (int i = 0; i < batch_size; i++) {
-        // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for
-        // channel_first
-        // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for
-        // channel_last
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-
-        if (data_dim == 2U) {
-          // im2col: dy -> col matrix
-          // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for
-          // channel_first
-          // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for
-          // channel_last
-          im2col(dev_ctx, output_grad_batch, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[2], paddings[1],
-                                  paddings[3]},
-                 &col, data_layout);
-        } else if (data_dim == 3U) {
-          // vol2col: dy -> col_matrix
-          // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h *
-          // i_w) for channel_first
-          // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h *
-          // k_w) for channel_last
-          vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings,
-                  &col, data_layout);
-        }
-
-        if (input_grad) {
-          // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c)
-          Tensor input_grad_batch =
-              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
-
-          // gemm: dx = filter * dy
-          // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h
-          // * i_w)
-          // or
-          // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h *
-          // i_w) -> (i_c,
-          // i_d, i_h, i_w)
-          // gemm: dx = dy^T * filter^T for channel_last
-
-          std::vector<Tensor> input_grad_batch_vec;
-          for (int g = 0; g < groups; g++) {
-            // input_grad_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w)
-            // for channel_first
-            // input_grad_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g)
-            // for channel_last
-            // filter_slice: (i_c/g, o_c/g * k_h * k_w)
-            Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
-            // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d *
-            // k_h * k_w, d * h * w)
-            Tensor col_matrix_slice =
-                col_matrix.Slice(g * col_step, (g + 1) * col_step);
-            if (data_layout != framework::DataLayout::kNHWC) {
-              Tensor input_grad_slice =
-                  input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-              blas.MatMul(filter_slice, false, col_matrix_slice, false,
-                          static_cast<T>(1.0), &input_grad_slice,
-                          static_cast<T>(0.0));
-            } else {
-              Tensor input_grad_slice;
-              Slice<DeviceContext, T, 2>(context, &input_grad_batch,
-                                         &input_grad_slice, g * in_step,
-                                         (g + 1) * in_step, 1);
-              blas.MatMul(col_matrix_slice, true, filter_slice, true,
-                          static_cast<T>(1.0), &input_grad_slice,
-                          static_cast<T>(0.0));
-              DDim input_grad_slice_shape;
-              if (data_dim == 2U) {
-                input_grad_slice_shape = {in_dims[1], in_dims[2], in_step};
-              } else {
-                input_grad_slice_shape = {in_dims[1], in_dims[2], in_dims[3],
-                                          in_step};
-              }
-              input_grad_slice =
-                  input_grad_slice.Resize(input_grad_slice_shape);
-              input_grad_batch_vec.push_back(input_grad_slice);
-            }
-          }
-          if (data_layout == framework::DataLayout::kNHWC) {
-            concat_functor(dev_ctx, input_grad_batch_vec,
-                           static_cast<int>(D - 2), &input_grad_batch);
-          }
-        }
-        if (filter_grad) {
-          // input batch: (i_c, i_h * i_w) or (i_h, i_w * i_c)
-          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-          // gemm: d_filter = x * dy^T
-          // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h
-          // * k_w)
-          // or
-          // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w)
-          // -> (i_c, o_c * k_d *
-          // k_h * k_w)
-          // gemm: d_filter = x^T * dy^T for channel_last
-
-          for (int g = 0; g < groups; g++) {
-            Tensor filter_grad_slice =
-                filter_grad_.Slice(g * in_step, (g + 1) * in_step);
-            Tensor col_matrix_slice =
-                col_matrix.Slice(g * col_step, (g + 1) * col_step);
-            if (data_layout != framework::DataLayout::kNHWC) {
-              Tensor in_batch_slice =
-                  in_batch.Slice(g * in_step, (g + 1) * in_step);
-              blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
-                          static_cast<T>(1.0), &filter_grad_slice,
-                          static_cast<T>(1.0));
-            } else {
-              Tensor in_batch_slice;
-              Slice<DeviceContext, T, 2>(context, &in_batch, &in_batch_slice,
-                                         g * in_step, (g + 1) * in_step, 1);
-              blas.MatMul(in_batch_slice, true, col_matrix_slice, true,
-                          static_cast<T>(1.0), &filter_grad_slice,
-                          static_cast<T>(1.0));
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc
index 7d0ebf21829c2..050ede78f72cf 100644
--- a/paddle/fluid/operators/conv_transpose_op_npu.cc
+++ b/paddle/fluid/operators/conv_transpose_op_npu.cc
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_transpose_op.h"
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 using NPUDeviceContext = platform::NPUDeviceContext;
 
 template <typename T>
@@ -55,8 +59,8 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
     filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
 
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm,
-                             in_data_dims, stride, ksize);
+    phi::UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm,
+                                  in_data_dims, stride, ksize);
 
     // construct NPU attr
     std::vector<int> strides(4, 1);
@@ -137,8 +141,8 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter_dims, 2, filter_dims.size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     std::vector<int> strides_vec(4, 1);
     std::vector<int> dilations_vec(4, 1);
diff --git a/paddle/fluid/operators/conv_transpose_op_xpu.cc b/paddle/fluid/operators/conv_transpose_op_xpu.cc
index 12e1739f2a267..b8bd3c4f00608 100644
--- a/paddle/fluid/operators/conv_transpose_op_xpu.cc
+++ b/paddle/fluid/operators/conv_transpose_op_xpu.cc
@@ -8,15 +8,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #include "paddle/fluid/operators/conv_transpose_op.h"
+
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 // target_len == 2 || target_len == 4
 inline std::vector<int> vector_extend(const std::vector<int>& src,
                                       int target_len) {
@@ -61,8 +68,8 @@ class Conv2DTransposeXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
@@ -135,8 +142,8 @@ class Conv2DTransposeGradXPUKernel : public framework::OpKernel<T> {
     framework::DDim filter_data_dims =
         phi::slice_ddim(filter.dims(), 2, filter.dims().size());
     std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             in_data_dims, strides, ksize);
+    phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                                  in_data_dims, strides, ksize);
 
     const int batch_size = static_cast<int>(input->dims()[0]);
     const int img_yc = static_cast<int>(input->dims()[1]);
diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc
index bff6673429d9a..889cdac8f6882 100644
--- a/paddle/fluid/operators/cumprod_op.cc
+++ b/paddle/fluid/operators/cumprod_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/cumprod_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,14 +23,6 @@ namespace operators {
 class CumprodOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod");
-
-    ctx->ShareDim("X", "Out");
-    ctx->ShareLoD("X", "Out");
-  }
 };
 
 class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -81,22 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 
 REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker,
                   ops::CumprodGradOpMaker<paddle::framework::OpDesc>,
-                  ops::CumprodGradOpMaker<paddle::imperative::OpBase>);
+                  ops::CumprodGradOpMaker<paddle::imperative::OpBase>,
+                  CumprodInferShapeFunctor);
 
 REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    cumprod, ops::CumprodOpCPUKernel<float>, ops::CumprodOpCPUKernel<double>,
-    ops::CumprodOpCPUKernel<int>, ops::CumprodOpCPUKernel<int64_t>,
-    ops::CumprodOpCPUKernel<paddle::platform::complex<float>>,
-    ops::CumprodOpCPUKernel<paddle::platform::complex<double>>);
-
-REGISTER_OP_CPU_KERNEL(
-    cumprod_grad, ops::CumprodGradOpCPUKernel<float>,
-    ops::CumprodGradOpCPUKernel<double>, ops::CumprodGradOpCPUKernel<int>,
-    ops::CumprodGradOpCPUKernel<int64_t>,
-    ops::CumprodGradOpCPUKernel<paddle::platform::complex<float>>,
-    ops::CumprodGradOpCPUKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu
deleted file mode 100644
index f792d6832917f..0000000000000
--- a/paddle/fluid/operators/cumprod_op.cu
+++ /dev/null
@@ -1,369 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <thrust/transform.h>
-#include "paddle/fluid/operators/cumprod_op.h"
-#include "paddle/fluid/operators/math/inclusive_scan.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct MultiplyFunctor {
-  HOSTDEVICE T operator()(T a, T b) const { return a * b; }
-};
-
-template <typename T>
-class CumprodOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<framework::Tensor>("X");
-    auto *y = ctx.Output<framework::Tensor>("Out");
-    auto dim = ctx.Attr<int>("dim");
-    size_t outer_dim, mid_dim, inner_dim;
-    GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
-
-    const auto *x_data = x->data<T>();
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    const auto &dev_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    math::InclusiveScan<T, MultiplyFunctor<T>>(
-        x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast<T>(1),
-        MultiplyFunctor<T>(), /*reverse=*/false, dev_ctx);
-  }
-};
-
-template <typename T>
-struct IsZeroFunctor {
-  HOSTDEVICE bool operator()(T x) const { return x == static_cast<T>(0); }
-};
-
-template <typename T>
-struct CumprodGradFunctorExceptFirstZero {
-  HOSTDEVICE CumprodGradFunctorExceptFirstZero(
-      const T *x, const T *y, const T *dy_mul_y_reversed_cumsum,
-      const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx,
-      int64_t *first_zero_idx, T *x_filled_one)
-      : x_(x),
-        y_(y),
-        dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum),
-        zero_mask_(zero_mask),
-        mid_dim_(mid_dim),
-        inner_dim_(inner_dim),
-        dx_(dx),
-        first_zero_idx_(first_zero_idx),
-        x_filled_one_(x_filled_one) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    auto inner_idx = idx % inner_dim_;
-    auto outer_idx = idx / (mid_dim_ * inner_dim_);
-    auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_;
-    auto mask = zero_mask_[idx];
-    bool should_fill_one = true;
-
-    if (mask == 0) {
-      dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx];
-      if (mid_idx == mid_dim_ - 1) {
-        // record first zero position as -1, i.e., no zero
-        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1;
-      }
-    } else if (mid_idx > 0) {                  // mask > 0
-      if (zero_mask_[idx - inner_dim_] > 0) {  // not first zero
-        dx_[idx] = 0;
-        should_fill_one = false;
-      } else {
-        // idx is the first zero position, it should be recorded
-        dx_[idx] = y_[idx - inner_dim_];
-        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx;
-      }
-    } else {  // the first zero position is index 0
-      dx_[idx] = 1;
-      first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0;
-    }
-
-    x_filled_one_[idx] = should_fill_one ? 1 : x_[idx];
-  }
-
- private:
-  const T *x_;
-  const T *y_;
-  const T *dy_mul_y_reversed_cumsum_;
-  const uint8_t *zero_mask_;
-  size_t mid_dim_;
-  size_t inner_dim_;
-  T *dx_;
-  int64_t *first_zero_idx_;
-  T *x_filled_one_;
-};
-
-template <typename T>
-struct FillFirstZeroPositionGradFunctor {
-  HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx,
-                                              const T *grad_value,
-                                              size_t mid_dim, size_t inner_dim,
-                                              T *dx)
-      : first_zero_idx_(first_zero_idx),
-        grad_value_(grad_value),
-        mid_dim_(mid_dim),
-        inner_dim_(inner_dim),
-        dx_(dx) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    auto outer_idx = idx / inner_dim_;
-    auto inner_idx = idx % inner_dim_;
-    auto mid_idx = first_zero_idx_[idx];
-    if (mid_idx >= 0) {
-      auto full_idx =
-          outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx;
-      dx_[full_idx] *= grad_value_[full_idx];
-    }
-  }
-
- private:
-  const int64_t *first_zero_idx_;
-  const T *grad_value_;
-  size_t mid_dim_;
-  size_t inner_dim_;
-  T *dx_;
-};
-
-/*
-Reference to
-https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
-input: x, y, dL/dy
-output: dL/dx
-dL/dx[i] = sum{0<=j<n} (dL/dy[j])*(dy[j]/dx[i]) (1)
-         = sum(0<=j<n} (dL/dy[j])*(d(x[0]*x[1]*...*x[j])/dx[i])
-if x[i] != 0, dL/dx[i] = sum{i<=j<n} (dL/dy[j])*(y[j]/x[i]) (2)
-if x[i] == 0, the formula(2) can not be applied directly.
-Suppose k is the first index of zero element, the formula will be:
-i > k, dL/dx[i] = 0;
-i < k, dL/dx[i] = 1/x[i]*sum{i<=j<n} (dL/dy[j]*y[j])
-i = k, dL/dx[i] = y[i-1]*sum{i<=j<n} (dL/dy[j])*(x[i+1]*...*x[j])
-
-First, we will show the main resolution.
-We need to judge the relationship between i (current index) and k (index
-which corresponds to the first element of 0).
-To mark the relationship, we now introduce zero_mask and we also need to
-mark the index of the first zero element.
-zero_mask = cummax(x[i] == 0);      //label whether x[i]==0 until the index.
-zero_index = -1;                    //store the first zero element's index.
-e.g. x = [1, 4, 5, 0, 2, 3, 0];
-     zero_mask = [0, 0, 0, 1, 1, 1, 1];
-     zero_index = 3;
-When i < k, we need to calculate the result of sum{i<=j<n}(d_y[j]*y[j]), we can
-use reversed cumsum to calculate it.
-R = reversed_cumsum(dy[j]*y[j]);     //store the calculation result of the
-sum{i<=j<n}(d_y[j]*y[j]) and x[k+1],x[k+2],...,x[j] along the index k+1 ~ j.
-When i = k, we need to calculate the result of prod{i<w<j}(x[w]).
-To calculate it, we introduce x_filled_one, which fill 1 before x[k+1] along
-the index 0 ~ k.
-e.g. x = [1, 4, 5, 0, 2, 3, 0];
-     x_filled_one = [1, 1, 1, 1, 2, 3, 0];
-Thus, we can use cumprod(x_filled_one[j]) to calculate the result of
-prod{k<=w<j}(x[w]).
-
-Then, we will show more detailed implementation.
-for (int i = 0; i < numel; i++) {
-    if (zero_mask[i] == 0) {       //case i < k
-        dx[i] = R[i] / x[i];
-        x_filled_one[i] = 1;
-    } else {
-        if (i == 0) {              //case i = k
-            dx[i] = 1;
-            zero_index = i;
-            x_filled_one[i] = 1;
-        } else {
-            if (zero_mask[i-1] == 0) {    //case i = k
-                dx[i] = y[i-1];
-                zero_index = i;
-                x_filled_one[i] = 1;
-            } else {                  //case i > k
-                dx[i] = 0;
-                x_filled_one[i] = x[i];
-            }
-        }
-    }
-}
-T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j]));
-if (zero_index != -1) {
-    dx[zero_index] *= T[zero_index];
-}
-*/
-
-template <typename T>
-class CumprodGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *x = ctx.Input<framework::Tensor>("X");
-    const auto *y = ctx.Input<framework::Tensor>("Out");
-    const auto *dy =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto dim = ctx.Attr<int>("dim");
-
-    size_t outer_dim, mid_dim, inner_dim;
-    GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
-    if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
-
-    size_t numel = outer_dim * mid_dim * inner_dim;
-
-    const auto *x_data = x->data<T>();
-    const auto *y_data = y->data<T>();
-    const auto *dy_data = dy->data<T>();
-
-    auto place = ctx.GetPlace();
-    const auto &dev_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-    auto *dx_data = dx->mutable_data<T>(place);
-
-    // deal with complex
-    const T *x_data_deal;
-    const T *y_data_deal;
-    memory::AllocationPtr x_conj;
-    memory::AllocationPtr y_conj;
-    if (framework::IsComplex<T>::value) {
-      x_conj = memory::Alloc(place, numel * sizeof(T));
-      auto *x_data_conj = reinterpret_cast<T *>(x_conj->ptr());
-      y_conj = memory::Alloc(place, numel * sizeof(T));
-      auto *y_data_conj = reinterpret_cast<T *>(y_conj->ptr());
-
-      platform::ForRange<platform::CUDADeviceContext> for_range_x(dev_ctx,
-                                                                  numel);
-      phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
-      for_range_x(functor_x);
-
-      platform::ForRange<platform::CUDADeviceContext> for_range_y(dev_ctx,
-                                                                  numel);
-      phi::funcs::ConjFunctor<T> functor_y(y_data, numel, y_data_conj);
-      for_range_y(functor_y);
-      x_data_deal = x_data_conj;
-      y_data_deal = y_data_conj;
-    } else {
-      x_data_deal = x_data;
-      y_data_deal = y_data;
-    }
-
-// Step 1: find cummax-ed zero mask of x
-#ifdef PADDLE_WITH_CUDA
-    const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream());
-#else
-    const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream());
-#endif
-    auto zero_mask_without_cummax =
-        memory::Alloc(place, numel * sizeof(uint8_t));
-    auto *zero_mask_without_cummax_data =
-        reinterpret_cast<uint8_t *>(zero_mask_without_cummax->ptr());
-    thrust::transform(
-        exec_policy, thrust::device_pointer_cast(x_data_deal),
-        thrust::device_pointer_cast(x_data_deal) + numel,
-        thrust::device_pointer_cast(zero_mask_without_cummax_data),
-        IsZeroFunctor<T>());
-
-    auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t));
-    auto *zero_mask_data = reinterpret_cast<uint8_t *>(zero_mask->ptr());
-    math::InclusiveScan<uint8_t, cub::Max>(
-        zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim,
-        inner_dim, static_cast<uint8_t>(0), cub::Max(), /*reverse=*/false,
-        dev_ctx);
-    zero_mask_without_cummax = nullptr;
-
-    // Step 2: calculate reversed cumsum(dy * y)
-    auto dy_mul_y = memory::Alloc(place, numel * sizeof(T));
-    auto *dy_mul_y_data = reinterpret_cast<T *>(dy_mul_y->ptr());
-    thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data),
-                      thrust::device_pointer_cast(dy_data) + numel,
-                      thrust::device_pointer_cast(y_data_deal),
-                      thrust::device_pointer_cast(dy_mul_y_data),
-                      MultiplyFunctor<T>());
-
-    auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T));
-    auto *dy_mul_y_reversed_cumsum_data =
-        reinterpret_cast<T *>(dy_mul_y_reversed_cumsum->ptr());
-    math::InclusiveScan<T, cub::Sum>(
-        dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(0), cub::Sum(), /*reverse=*/true, dev_ctx);
-
-    // Step 3: calculate the gradient value except the first zero position.
-    // The gradient value of the first zero position is filled with out[idx-1],
-    // while the gradient value of the other positions are calculated out
-    // completely. This functor also:
-    //  (1) find the first zero index, i.e., first_zero_idx_data.
-    //  (2) fill x_filled_one, which satifies
-    //      x_filled_one[i] = x[i], i > pos
-    //      x_filled_one[i] = 1, i <= pos
-    auto first_zero_idx =
-        memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t));
-    auto *first_zero_idx_data =
-        reinterpret_cast<int64_t *>(first_zero_idx->ptr());
-    auto *x_filled_one_data = dy_mul_y_data;  // reuse former allocated memory
-    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, numel);
-    CumprodGradFunctorExceptFirstZero<T> functor_except_first_zero(
-        x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data,
-        mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data);
-    for_range(functor_except_first_zero);
-
-    // Step 4: calculate cumprod of x_filled_one
-    auto *x_filled_one_cumprod_data =
-        dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
-    math::InclusiveScan<T, MultiplyFunctor<T>>(
-        x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(1), MultiplyFunctor<T>(), /*reverse=*/false,
-        dev_ctx);
-
-    // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod)
-    auto *dy_mul_x_filled_one_cumprod =
-        dy_mul_y_data;  // reuse former allocated memory
-    thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data),
-                      thrust::device_pointer_cast(dy_data) + numel,
-                      thrust::device_pointer_cast(x_filled_one_cumprod_data),
-                      thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod),
-                      MultiplyFunctor<T>());
-    auto *dy_mul_x_filled_one_cumprod_reversed_cumsum =
-        dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
-    math::InclusiveScan<T, cub::Sum>(
-        dy_mul_x_filled_one_cumprod,
-        dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim,
-        inner_dim, static_cast<T>(0), cub::Sum(),
-        /*reverse=*/true, dev_ctx);
-
-    // Step 6: fill zero pos gradient value
-    platform::ForRange<platform::CUDADeviceContext>
-        for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim);
-    FillFirstZeroPositionGradFunctor<T> fill_first_zero_pos_grad_functor(
-        first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum,
-        mid_dim, inner_dim, dx_data);
-    for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    cumprod, ops::CumprodOpCUDAKernel<float>, ops::CumprodOpCUDAKernel<double>,
-    ops::CumprodOpCUDAKernel<int>, ops::CumprodOpCUDAKernel<int64_t>,
-    ops::CumprodOpCUDAKernel<paddle::platform::complex<float>>,
-    ops::CumprodOpCUDAKernel<paddle::platform::complex<double>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    cumprod_grad, ops::CumprodGradOpCUDAKernel<float>,
-    ops::CumprodGradOpCUDAKernel<double>, ops::CumprodGradOpCUDAKernel<int>,
-    ops::CumprodGradOpCUDAKernel<int64_t>,
-    ops::CumprodGradOpCUDAKernel<paddle::platform::complex<float>>,
-    ops::CumprodGradOpCUDAKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h
deleted file mode 100644
index 74ed2008ae983..0000000000000
--- a/paddle/fluid/operators/cumprod_op.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <type_traits>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/for_range.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim,
-                              size_t* outer_dim, size_t* mid_dim,
-                              size_t* inner_dim) {
-  PADDLE_ENFORCE_GE(
-      cumprod_dim, -dim.size(),
-      platform::errors::InvalidArgument(
-          "The input dim of CumprodOp should be larger than the opposite "
-          "rank of input x which is %d.But received dim=%d",
-          -dim.size(), cumprod_dim));
-  PADDLE_ENFORCE_LT(cumprod_dim, dim.size(),
-                    platform::errors::InvalidArgument(
-                        "The input dim of CumprodOp should be smaller than the "
-                        "rank of input x which is %d.But received dim=%d",
-                        dim.size(), cumprod_dim));
-  if (cumprod_dim < 0) cumprod_dim += dim.size();
-
-  *outer_dim = 1;
-  for (int i = 0; i < cumprod_dim; ++i) {
-    *outer_dim *= dim[i];
-  }
-  *mid_dim = dim[cumprod_dim];
-  *inner_dim = 1;
-  for (int i = cumprod_dim + 1; i < dim.size(); ++i) {
-    *inner_dim *= dim[i];
-  }
-}
-
-template <typename T>
-class CumprodOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    int dim = context.Attr<int>("dim");
-
-    auto* x_data = x->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    framework::DDim shape = x->dims();
-
-    size_t outer_dim = 1;
-    size_t mid_dim = 1;
-    size_t inner_dim = 1;
-    GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
-
-    for (size_t i = 0; i < outer_dim; i++) {
-      for (size_t j = 0; j < mid_dim; j++) {
-        for (size_t k = 0; k < inner_dim; k++) {
-          size_t pos = i * mid_dim * inner_dim + j * inner_dim + k;
-          if (j == 0) {
-            out_data[pos] = x_data[pos];
-          } else {
-            out_data[pos] = out_data[pos - inner_dim] * x_data[pos];
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class CumprodGradOpCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const {
-    const Tensor* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor* x = context.Input<Tensor>("X");
-    const Tensor* out = context.Input<Tensor>("Out");
-
-    int dim = context.Attr<int>("dim");
-    framework::DDim shape = x->dims();
-    Tensor* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-
-    auto* d_out_data = d_out->data<T>();
-    auto* x_data = x->data<T>();
-    auto* out_data = out->data<T>();
-    auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-
-    auto place = context.GetPlace();
-    const auto& dev_ctx =
-        context.template device_context<platform::CPUDeviceContext>();
-
-    size_t outer_dim = 1;
-    size_t mid_dim = 1;
-    size_t inner_dim = 1;
-    GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
-    size_t numel = outer_dim * mid_dim * inner_dim;
-
-    // deal with complex
-    const T* x_data_deal;
-    const T* out_data_deal;
-    memory::AllocationPtr x_conj;
-    memory::AllocationPtr out_conj;
-    if (framework::IsComplex<T>::value) {
-      x_conj = memory::Alloc(place, numel * sizeof(T));
-      auto* x_data_conj = reinterpret_cast<T*>(x_conj->ptr());
-      out_conj = memory::Alloc(place, numel * sizeof(T));
-      auto* out_data_conj = reinterpret_cast<T*>(out_conj->ptr());
-
-      platform::ForRange<platform::CPUDeviceContext> for_range_x(dev_ctx,
-                                                                 numel);
-      phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
-      for_range_x(functor_x);
-
-      platform::ForRange<platform::CPUDeviceContext> for_range_out(dev_ctx,
-                                                                   numel);
-      phi::funcs::ConjFunctor<T> functor_out(out_data, numel, out_data_conj);
-      for_range_out(functor_out);
-
-      x_data_deal = x_data_conj;
-      out_data_deal = out_data_conj;
-    } else {
-      x_data_deal = x_data;
-      out_data_deal = out_data;
-    }
-
-    for (size_t i = 0; i < outer_dim; i++) {
-      for (size_t k = 0; k < inner_dim; k++) {
-        for (size_t j = 0; j < mid_dim; j++) {
-          size_t index = i * mid_dim * inner_dim + j * inner_dim + k;
-          d_x_data[index] = 0;
-          for (size_t n = 0; n < mid_dim; n++) {
-            size_t pos = i * mid_dim * inner_dim + n * inner_dim + k;
-            T elem;
-            if (j == 0) {
-              elem = d_out_data[pos];
-            } else {
-              elem = d_out_data[pos] * out_data_deal[index - inner_dim];
-            }
-            if (pos > index) {
-              for (size_t m = index + inner_dim; m <= pos; m += inner_dim) {
-                elem *= x_data_deal[m];
-              }
-            } else if (pos < index) {
-              elem = static_cast<T>(0);
-            }
-            d_x_data[index] += elem;
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc
index b15efc5f84bdd..6e15fd090b8c4 100644
--- a/paddle/fluid/operators/deformable_conv_op.cc
+++ b/paddle/fluid/operators/deformable_conv_op.cc
@@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp,
 
 REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp);
 
-REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel<float>,
-                       ops::DeformableConvCPUKernel<double>);
 REGISTER_OP_CPU_KERNEL(deformable_conv_grad,
                        ops::DeformableConvGradCPUKernel<float>,
                        ops::DeformableConvGradCPUKernel<double>);
diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu
index 2c7d905c79b37..ad10abf9c647b 100644
--- a/paddle/fluid/operators/deformable_conv_op.cu
+++ b/paddle/fluid/operators/deformable_conv_op.cu
@@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n,
   }
 }
 
-template <typename DeviceContext, typename T>
-class DeformableConvCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const Tensor offset = *ctx.Input<Tensor>("Offset");
-    const Tensor mask = *ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.cuda_device_context();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, DeviceContext>(output_shape, dev_ctx);
-
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(phi::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        phi::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
-
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset.numel() / offset.dims()[0];
-    int input_mask_dim = mask.numel() / mask.dims()[0];
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset.data<T>();
-    const T* mask_ptr = mask.data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      ModulatedDeformableIm2col(
-          ctx.device_context(), input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
-            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
-            phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
-
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(phi::make_ddim(output_shape_vec));
-  }
-};
-
 template <typename DeviceContext, typename T>
 class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 using CUDA = paddle::platform::CUDADeviceContext;
 
-REGISTER_OP_CUDA_KERNEL(deformable_conv,
-                        ops::DeformableConvCUDAKernel<CUDA, float>,
-                        ops::DeformableConvCUDAKernel<CUDA, double>);
 REGISTER_OP_CUDA_KERNEL(deformable_conv_grad,
                         ops::DeformableConvGradCUDAKernel<CUDA, float>,
                         ops::DeformableConvGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h
index 66961655ee6ff..1176b96987ed6 100644
--- a/paddle/fluid/operators/deformable_conv_op.h
+++ b/paddle/fluid/operators/deformable_conv_op.h
@@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height,
   }
 }
 
-template <typename T>
-class DeformableConvCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("Input");
-    auto* offset = ctx.Input<Tensor>("Offset");
-    auto* mask = ctx.Input<Tensor>("Mask");
-    Tensor filter = *ctx.Input<Tensor>("Filter");
-    Tensor* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<CPUDeviceContext>();
-
-    const int groups = ctx.Attr<int>("groups");
-    const int deformable_groups = ctx.Attr<int>("deformable_groups");
-    const int im2col_step = ctx.Attr<int>("im2col_step");
-    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
-    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
-
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
-    std::vector<int64_t> output_shape_vec(phi::vectorize(output->dims()));
-
-    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
-    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-    col_buffer_shape_vec[0] =
-        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
-    col_buffer_shape_vec[1] = im2col_step;
-    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec));
-    std::vector<int64_t> output_buffer_shape_vec(1);
-    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                                 output_shape_vec[2] * output_shape_vec[3];
-    framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec));
-    Tensor col_buffer;
-    Tensor output_buffer;
-    col_buffer = ctx.AllocateTmpTensor<T, CPUDeviceContext>(col_shape, dev_ctx);
-    output_buffer =
-        ctx.AllocateTmpTensor<T, CPUDeviceContext>(output_shape, dev_ctx);
-    int64_t M = output_shape_vec[1] / groups;
-    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-    int64_t K =
-        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
-
-    Tensor weight_3d;
-    weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
-    Tensor col_buffer_3d;
-    col_buffer_3d.ShareDataWith(col_buffer)
-        .Resize(phi::make_ddim({groups, K, N}));
-    Tensor output_4d;
-    output_4d.ShareDataWith(output_buffer)
-        .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
-    output_4d.mutable_data<T>(ctx.GetPlace());
-    framework::DDim input_shape =
-        phi::slice_ddim(input->dims(), 1, input->dims().size());
-    std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
-    int input_dim = input->numel() / input->dims()[0];
-    int input_offset_dim = offset->numel() / offset->dims()[0];
-    int input_mask_dim = mask->numel() / mask->dims()[0];
-    auto blas = phi::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
-    const T* input_ptr = input->data<T>();
-    const T* offset_ptr = offset->data<T>();
-    const T* mask_ptr = mask->data<T>();
-    col_buffer.mutable_data<T>(ctx.GetPlace());
-    T* col_buffer_ptr = col_buffer.data<T>();
-    for (int i = 0; i < batch_size / im2col_step; ++i) {
-      ModulatedDeformableIm2colCPU(
-          dev_ctx, input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
-          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
-          deformable_groups, col_buffer_ptr);
-      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
-          phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
-      // get the product of pixel and weight
-      for (int g = 0; g < groups; ++g) {
-        Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
-            phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
-        Tensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
-            phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
-        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
-                    &output_3d_slice, T(0.0));
-      }
-    }
-    output->ShareDataWith(output_buffer)
-        .Resize(phi::make_ddim(output_shape_vec));
-  }
-};
-
 template <typename T>
 class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc
index 0d9fbf612f73c..35e389090175f 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
@@ -9,8 +9,10 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor,
+                            PD_INFER_META(phi::YoloBoxInferMeta));
 REGISTER_OPERATOR(
     yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    YoloBoxInferShapeFunctor);
 
 REGISTER_OP_VERSION(yolo_box)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc
index 98247fbc862bb..6959b5cf81106 100644
--- a/paddle/fluid/operators/determinant_op.cc
+++ b/paddle/fluid/operators/determinant_op.cc
@@ -13,6 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/determinant_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,11 +24,6 @@ namespace operators {
 class DeterminantOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant");
-  }
 };
 
 class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input",
-                   "DeterminantGradOp");
-    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
-                   framework::GradVarName("Out"), "DeterminantGradOp");
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output",
-                   framework::GradVarName("Input"), "DeterminantGradOp");
-
-    ctx->SetOutputDim(framework::GradVarName("Input"),
-                      ctx->GetInputDim("Input"));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -162,19 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer,
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker,
                   ops::DeterminantGradOpMaker<paddle::framework::OpDesc>,
-                  ops::DeterminantGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp)
+                  ops::DeterminantGradOpMaker<paddle::imperative::OpBase>,
+                  DeterminantInferShapeFunctor);
 
-REGISTER_OP_CPU_KERNEL(determinant,
-                       ops::DeterminantKernel<plat::CPUDeviceContext, float>,
-                       ops::DeterminantKernel<plat::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    determinant_grad, ops::DeterminantGradKernel<plat::CPUDeviceContext, float>,
-    ops::DeterminantGradKernel<plat::CPUDeviceContext, double>);
+DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralUnaryGradInferMeta));
+REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp,
+                  DeterminantGradInferShapeFunctor);
 
 REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp,
                   ops::SlogDeterminantOpMaker,
diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu
index d19d4c3d09386..d8237fa3004e6 100644
--- a/paddle/fluid/operators/determinant_op.cu
+++ b/paddle/fluid/operators/determinant_op.cu
@@ -17,14 +17,6 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    determinant, ops::DeterminantKernel<plat::CUDADeviceContext, float>,
-    ops::DeterminantKernel<plat::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    determinant_grad,
-    ops::DeterminantGradKernel<plat::CUDADeviceContext, float>,
-    ops::DeterminantGradKernel<plat::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
     slogdeterminant, ops::SlogDeterminantKernel<plat::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h
index f89ecd3722287..a1fe8a25665ec 100644
--- a/paddle/fluid/operators/determinant_op.h
+++ b/paddle/fluid/operators/determinant_op.h
@@ -22,12 +22,15 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/determinant_kernel_impl.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
@@ -40,232 +43,6 @@ T sign(T val) {
   return static_cast<T>(T(0) < val) - (val < T(0));
 }
 
-template <typename T>
-class EigenMatrix {};
-
-template <>
-class EigenMatrix<float> {
- public:
-  using MatrixType = Eigen::MatrixXf;
-};
-
-template <>
-class EigenMatrix<double> {
- public:
-  using MatrixType = Eigen::MatrixXd;
-};
-
-inline int64_t GetBatchCount(const framework::DDim dims) {
-  int64_t batch_count = 1;
-  auto dim_size = dims.size();
-  PADDLE_ENFORCE_GE(
-      dim_size, 2,
-      platform::errors::InvalidArgument(
-          "the input matrix dimension size should greater than 2."));
-
-  // Cumulative multiplying each dimension until the last 2 to get the batch
-  // count,
-  // for example a tensor with shape [3,3,3,3], the batch count of matrices is
-  // 9.
-  for (int64_t i = 0; i < dims.size() - 2; i++) {
-    batch_count *= dims[i];
-  }
-
-  return batch_count;
-}
-
-template <typename T>
-struct DeterminantFunctor {
-  void operator()(const Tensor& input, const framework::ExecutionContext ctx,
-                  int64_t rank, int64_t batch_count, Tensor* output) {
-    std::vector<T> input_vec;
-    std::vector<T> output_vec;
-    framework::TensorToVector(input, ctx.device_context(), &input_vec);
-    for (int64_t i = 0; i < batch_count; ++i) {  // maybe can be parallel
-      auto begin_iter = input_vec.begin() + i * rank * rank;
-      auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
-      std::vector<T> sub_vec(begin_iter,
-                             end_iter);  // get every square matrix data
-      typename EigenMatrix<T>::MatrixType matrix(rank, rank);
-      for (int64_t i = 0; i < rank; ++i) {
-        for (int64_t j = 0; j < rank; ++j) {
-          matrix(i, j) = sub_vec[rank * i + j];
-        }
-      }
-      output_vec.push_back(matrix.determinant());
-    }
-    framework::TensorFromVector(output_vec, output);
-  }
-};
-template <typename DeviceContext, typename T>
-class DeterminantKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("Input");
-    auto input_dim = vectorize(input->dims());
-    auto input_dim_size = input_dim.size();
-    auto* output = context.Output<framework::Tensor>("Out");
-
-    auto batch_count = GetBatchCount(input->dims());
-    VLOG(2) << "input dim:" << input->dims();
-    PADDLE_ENFORCE_GE(
-        input_dim_size, 2,
-        platform::errors::InvalidArgument(
-            "the input matrix dimension size should greater than 2."));
-    PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1],
-                      input_dim[input_dim_size - 2],
-                      platform::errors::InvalidArgument(
-                          "the input matrix should be square matrix."));
-    auto rank = input_dim[input_dim_size - 1];  // square matrix length
-    DeterminantFunctor<T>()(*input, context, rank, batch_count, output);
-    auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2);
-    if (input_dim_size > 2) {
-      output->Resize(output_dims);
-    } else {
-      // when input is a two-dimension matrix, The det value is a number.
-      output->Resize({1});
-    }
-    VLOG(2) << "output dim:" << output->dims();
-  }
-};
-
-template <typename T>
-struct FoundZeroFunctor {
-  FoundZeroFunctor(const T* x, int64_t numel, bool* res)
-      : x_(x), numel_(numel), res_(res) {}
-  HOSTDEVICE void operator()(size_t idx) const {
-    if (*res_ || idx >= static_cast<size_t>(numel_)) {
-      // founded zero number
-      return;
-    }
-    *res_ = (x_[idx] == static_cast<T>(0));
-  }
-  const T* x_;
-  int64_t numel_;
-  bool* res_;
-};
-
-template <typename DeviceContext, typename T>
-inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx,
-                                  const framework::Tensor* det) {
-  auto& dev_ctx = ctx.template device_context<DeviceContext>();
-  auto numel = det->numel();
-
-  framework::Tensor dev_tensor;
-  auto* data = dev_tensor.mutable_data<bool>({1}, ctx.GetPlace());
-
-  // set false
-  phi::funcs::SetConstant<DeviceContext, bool> zero;
-  zero(dev_ctx, &dev_tensor, false);
-
-  // find whether zero
-  platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-  FoundZeroFunctor<T> functor(det->data<T>(), numel, data);
-  for_range(functor);
-
-  // copy to host
-  dev_ctx.Wait();
-  framework::Tensor cpu_tensor;
-  framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor);
-
-  // if founded zero, the matrix is not invertible
-  // else the matrix is invertible
-  auto* res = cpu_tensor.data<bool>();
-  return !(*res);
-}
-
-template <typename DeviceContext, typename T>
-class DeterminantGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& orig_dev_ctx = context.template device_context<DeviceContext>();
-    const auto* input = context.Input<framework::Tensor>("Input");
-    const auto* det = context.Input<framework::Tensor>("Out");
-    const auto* grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* ddet =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-
-    auto input_dims_size = input->dims().size();
-    if (input_dims_size > 2) {
-      PADDLE_ENFORCE_EQ(
-          grad->dims().size() + 2, input_dims_size,
-          platform::errors::InvalidArgument(
-              "The grad tensor of det dims size should 2 less than"
-              " input tensor's, but here differ %d",
-              input_dims_size - grad->dims().size()));
-    } else if (input_dims_size == 2) {
-      // input dims size 2 and grad dims size 1 is possible
-      PADDLE_ENFORCE_EQ(
-          grad->dims().size(), 1,
-          platform::errors::InvalidArgument(
-              "The grad tensor of det dims size should 2 less than"
-              " input tensor's, but here differ %d",
-              input_dims_size - grad->dims().size()));
-    } else {
-      // checked in forward, pass
-    }
-
-    auto& dev_ctx = static_cast<
-        const typename framework::ConvertToPhiContext<DeviceContext>::TYPE&>(
-        orig_dev_ctx);
-
-    // Check Whether the matrix is invertible
-    // (matrix A not invertible) == (det(A)=0)
-    if (!CheckMatrixInvertible<DeviceContext, T>(context, det)) {
-      // The matrix is not invertible
-      VLOG(3) << "The input matrix not invertible!";
-      ddet->Resize(input->dims());
-      phi::Full<T>(dev_ctx, phi::vectorize(input->dims()), static_cast<T>(0.0f),
-                   ddet);
-      return;
-    }
-
-    // The matrix is invertible
-    // let |A| = Determinant(A)
-    // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
-    // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2,
-    // -1)
-
-    // First: inverse(A)
-    framework::Tensor inverse_A;
-    // A must be square matrices!
-    inverse_A.Resize(input->dims());
-    inverse_A.mutable_data<T>(context.GetPlace());
-
-    phi::funcs::MatrixInverseFunctor<DeviceContext, T> mat_inv;
-    mat_inv(orig_dev_ctx, *input, &inverse_A);
-
-    VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
-
-    // Second: inverse(A).transpose(-2, -1)
-    framework::Tensor transpose_inverse_A =
-        phi::TransposeLast2Dim<T>(dev_ctx, inverse_A);
-
-    VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: "
-            << transpose_inverse_A.dims();
-
-    // Third: dA * |A|
-    auto mul_dA_detA = phi::Multiply<T>(dev_ctx, *grad, *det);
-    VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims();
-
-    // Fourth: unsqueeze(dA * |A|, [-1, -2])
-    auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1);
-    auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2);
-    VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims();
-
-    // Finally: unsqueeze(dA * |A|) * inverse(A)
-    auto res = phi::Multiply<T>(dev_ctx, unsqueeze2, transpose_inverse_A);
-
-    VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims();
-
-    framework::TensorCopy(res, context.GetPlace(), ddet);
-
-    ddet->Resize(input->dims());
-    VLOG(3) << "d|A| dims: " << ddet->dims();
-  }
-};
-
 template <typename T>
 struct SlogDeterminantFunctor {
   void operator()(const Tensor& input, const framework::ExecutionContext ctx,
@@ -280,7 +57,7 @@ struct SlogDeterminantFunctor {
       auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
       std::vector<T> sub_vec(begin_iter,
                              end_iter);  // get every square matrix data
-      typename EigenMatrix<T>::MatrixType matrix(rank, rank);
+      typename phi::detail::EigenMatrix<T>::MatrixType matrix(rank, rank);
       for (int64_t i = 0; i < rank; ++i) {
         for (int64_t j = 0; j < rank; ++j) {
           matrix(i, j) = sub_vec[rank * i + j];
@@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel<T> {
     auto input_dim_size = input_dim.size();
     auto* output = context.Output<framework::Tensor>("Out");
 
-    auto batch_count = GetBatchCount(input->dims());
+    auto batch_count = phi::detail::GetBatchCount(input->dims());
     VLOG(2) << "input dim:" << input->dims();
     PADDLE_ENFORCE_GE(
         input_dim_size, 2,
@@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel<T> {
     // (matrix A not invertible) == (absslogdet(A)=0)
     auto slogdet_vec = slogdet->Split(1, 0);
     auto absslogdet_val = slogdet_vec[0];
-    if (!CheckMatrixInvertible<DeviceContext, T>(context, &absslogdet_val)) {
+    if (!phi::detail::CheckMatrixInvertible<
+            T, typename framework::ConvertToPhiContext<DeviceContext>::TYPE>(
+            dev_ctx, &absslogdet_val)) {
       // The matrix is not invertible
       VLOG(3) << "The input matrix not invertible!";
       dslogdet->Resize(input->dims());
diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h
index 144198367d538..94db4c62e3912 100644
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -35,143 +35,99 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/functors.h"
-
 namespace paddle {
 namespace operators {
+template <typename T1, typename T2 = T1, typename OutT = T1>
+struct DstMaskGenerator {
+  const float dropout_prob_;
+  const bool is_upscale_in_train_;
+  using MT = typename details::MPTypeTrait<T1>::Type;
+  MT factor;
+  HOSTDEVICE inline DstMaskGenerator(const float dropout_prob,
+                                     const bool is_upscale_in_train)
+      : dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) {
+    factor = static_cast<MT>(1.0f / (1.0f - dropout_prob_));
+  }
 
-template <typename T, typename MaskType>
-__global__ void RandomGenerator(const size_t n, uint64_t seed,
-                                const float dropout_prob, const T* src,
-                                MaskType* mask, T* dst,
-                                bool is_upscale_in_train, uint64_t increment) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-#ifdef PADDLE_WITH_HIP
-  hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx, increment, &state);
-#else
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
-#endif
-
-  MaskType mask_val;
-  T dst_val;
-  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-  for (; idx < n; idx += blockDim.x * gridDim.x) {
-    T src_val = src[idx];
-#ifdef PADDLE_WITH_HIP
-    if (hiprand_uniform(&state) < dropout_prob) {
-#else
-    if (curand_uniform(&state) < dropout_prob) {
-#endif
-      mask_val = 0;
-      dst_val = 0;
-    } else {
-      mask_val = 1;
-      dst_val = is_upscale_in_train
-                    ? static_cast<T>(static_cast<MT>(src_val) * factor)
-                    : src_val;
+  HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val,
+                                    const T2* rand, int num) const {
+    static constexpr int kCount =
+        phi::funcs::uniform_distribution<T2>::kReturnsCount;
+// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask
+#pragma unroll
+    for (int i = 0; i < kCount; i++) {
+      if (rand[i] < dropout_prob_) {
+        dst[i] = static_cast<T1>(0);
+        dst[i + kCount] = dst[i];
+      } else {
+        dst[i] = is_upscale_in_train_
+                     ? static_cast<T1>(static_cast<MT>(src_val[i]) * factor)
+                     : static_cast<T1>(src_val[i]);
+        dst[i + kCount] = static_cast<T1>(1);
+      }
     }
-    mask[idx] = mask_val;
-    dst[idx] = dst_val;
   }
-}
+};
 
-template <typename T, typename MaskType, int VecSize>
+template <typename T, typename MaskType>
 __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
                                           const float dropout_prob,
                                           const T* src, MaskType* mask, T* dst,
                                           bool is_upscale_in_train,
-                                          uint64_t increment) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = phi::AlignedVector<T, VecSize>;
-  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
-
+                                          uint64_t increment,
+                                          size_t main_offset) {
+  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
+  static constexpr int kCount =
+      phi::funcs::uniform_distribution<float>::kReturnsCount;
+  size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount;
 #ifdef PADDLE_WITH_HIP
-  int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
   hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx, increment, &state);
+  hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
+  using SType = hiprandStatePhilox4_32_10_t;
 #else
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
   curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
-#endif
-
-  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-  for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
-    LoadT src_val;
-    phi::Load<T, VecSize>(&src[i], &src_val);
-
-#ifdef PADDLE_WITH_HIP
-    float4 rand = hiprand_uniform4(&state);
-#else
-    float4 rand = curand_uniform4(&state);
+  curand_init(seed, idx + THREAD_ID_X, increment, &state);
+  using SType = curandStatePhilox4_32_10_t;
 #endif
-
-    LoadT dst_val;
-    MaskLoadT mask_val;
-
-#pragma unroll
-    for (int j = 0; j < VecSize; j++) {
-      if ((&rand.x)[j] < dropout_prob) {
-        dst_val[j] = 0;
-        mask_val[j] = 0;
-      } else {
-        dst_val[j] = is_upscale_in_train
-                         ? static_cast<T>(static_cast<MT>(src_val[j]) * factor)
-                         : src_val[j];
-        mask_val[j] = 1;
-      }
-    }
-
-    phi::Store<T, VecSize>(dst_val, &dst[i]);
-    phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
+  T dst_mask[kCount * 2];  // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask
+  float rands[kCount];
+  MaskType mask_result[kCount];
+  using Rand = phi::funcs::uniform_distribution<float>;
+  using Cast = kps::IdentityFunctor<T>;
+  int deal_size = BLOCK_NUM_X * kCount;
+  auto dst_functor =
+      DstMaskGenerator<T, float>(dropout_prob, is_upscale_in_train);
+  size_t fix = idx * kCount;
+  for (; fix < main_offset; fix += stride) {
+    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
+                                                          &state);
+    // dst
+    kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
+        &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
+    kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
+    // mask
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+        &mask_result[0], &dst_mask[kCount], Cast());
+    kps::WriteData<MaskType, kCount, 1, 1, false>(mask + fix, &mask_result[0],
+                                                  deal_size);
   }
-}
-
-template <typename T, typename MaskType>
-struct CudaDropoutGradFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-
-  explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
-
-  __device__ __forceinline__ T operator()(const T dout,
-                                          const MaskType mask) const {
-    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
-                          factor_);
-  }
-
- private:
-  MT factor_;
-};
-
-template <typename T, typename MaskType, int VecSize>
-__global__ void DropoutGradCUDAKernel(
-    const T* dout, const MaskType* mask,
-    const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
-    T* dx) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = phi::AlignedVector<T, VecSize>;
-  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
-
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
-    LoadT dout_val;
-    phi::Load<T, VecSize>(&dout[i], &dout_val);
-
-    MaskLoadT mask_val;
-    phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
-
-    LoadT dx_val;
-
-#pragma unroll
-    for (int j = 0; j < VecSize; j++) {
-      dx_val[j] = static_cast<T>(static_cast<MT>(dout_val[j]) *
-                                 static_cast<MT>(mask_val[j]) * factor);
-    }
-
-    phi::Store<T, VecSize>(dx_val, &dx[i]);
+  int remainder = n - fix;
+  if (remainder > 0) {
+    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
+                                                          &state);
+    // dst
+    kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
+        &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
+    kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
+    // mask
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+        &mask_result[0], &dst_mask[kCount], Cast());
+    kps::WriteData<MaskType, kCount, 1, 1, true>(mask + fix, &mask_result[0],
+                                                 remainder);
   }
 }
 
@@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
     uint64_t seed_data;
     uint64_t increment;
     // VectorizedRandomGenerator use curand_uniform4, so we only support
-    // vec_size is 4;
-    int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
+    // kVecSize is 4;
+    constexpr int kVecSize =
+        phi::funcs::uniform_distribution<float>::kReturnsCount;
     auto gpu_config =
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
     auto offset =
-        ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
-
+        ((x_numel - 1) / (gpu_config.GetThreadNum() * kVecSize) + 1) * kVecSize;
     GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
                             &seed_data, &increment);
-
-#ifdef __HIPCC__
-    if (vec_size == 4 && size % 4 == 0) {
-      hipLaunchKernelGGL(
-          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
-          gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
-          seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
-          increment);
-    } else {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
-                         gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
-                         stream, size, seed_data, dropout_prob, x_data,
-                         mask_data, y_data, upscale_in_train, increment);
-    }
-#else
-    if (vec_size == 4 && size % 4 == 0) {
-      VectorizedRandomGenerator<T, uint8_t, 4><<<
-          gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
-          size, seed_data, dropout_prob, x_data, mask_data, y_data,
-          upscale_in_train, increment);
-    } else {
-      RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
-                                    gpu_config.thread_per_block, 0, stream>>>(
-          size, seed_data, dropout_prob, x_data, mask_data, y_data,
-          upscale_in_train, increment);
-    }
-#endif
+    size_t main_offset = size / (gpu_config.GetBlockSize() * kVecSize) *
+                         (gpu_config.GetBlockSize() * kVecSize);
+    VectorizedRandomGenerator<T, uint8_t><<<
+        gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream>>>(
+        size, seed_data, dropout_prob, x_data, mask_data, y_data,
+        upscale_in_train, increment, main_offset);
   } else {
     if (upscale_in_train) {
 // todo: can y share with data with x directly?
@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
   }
 }
 
+template <typename T, typename MaskType>
+struct CudaDropoutGradFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+
+  explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
+
+  __device__ __forceinline__ T operator()(const T dout,
+                                          const MaskType mask) const {
+    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
+                          factor_);
+  }
+
+ private:
+  MT factor_;
+};
+
 template <typename T>
 void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
                                 const std::string dropout_implementation,
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 6d52ce45c4c10..3d9950902acfe 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout");
-
-    auto x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", x_dims);
-    if (ctx->Attrs().Get<bool>("is_test") == false) {
-      ctx->SetOutputDim("Mask", x_dims);
-    }
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -173,7 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor,
+                            PD_INFER_META(phi::DropoutInferMeta));
+
 REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
                   ops::DropoutGradOpMaker<paddle::framework::OpDesc>,
-                  ops::DropoutGradOpMaker<paddle::imperative::OpBase>);
+                  ops::DropoutGradOpMaker<paddle::imperative::OpBase>,
+                  DropoutInferShapeFunctor);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h
index 5e4c83e1a45eb..6daf05a9d778d 100644
--- a/paddle/fluid/operators/eig_op.h
+++ b/paddle/fluid/operators/eig_op.h
@@ -21,13 +21,13 @@
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/slice.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/math_kernel.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a995877778e47..c28abb916b7a7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -27,7 +27,7 @@ limitations under the License. */
 
 // only can include the headers in paddle/phi/include dirs
 #include "paddle/phi/kernels/elementwise_grad_kernel.h"
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 58a3123c7e332..6f4aba93d56e2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 838df2e162591..f9347d281043e 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,100 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
 
-namespace paddle {
-namespace framework {
-class ExecutionContext;
-}  // namespace framework
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-template <typename T>
-class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& onednn_engine = dev_ctx.GetEngine();
-
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    auto tz = phi::vectorize<int64_t>(dout->dims());
-    memory::data_type dout_type = framework::ToMKLDNNDataType(
-        framework::TransToProtoVarType(dout->dtype()));
-    platform::ReorderMKLDNNHandler handler(
-        tz, framework::TransToProtoVarType(dout->dtype()), dout_type,
-        onednn_engine);
-
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-    auto reorder_src_memory_p = handler.AcquireSrcMemory(
-        dout->format(), platform::to_void_cast(dout->data<T>()));
-
-    if (dx) {
-      auto reorder_dst_memory_p =
-          handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
-      auto reorder_p =
-          handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-      platform::RecordEvent record_reorder(
-          "int_reorder", platform::TracerEventType::UserDefined, 2,
-          platform::EventRole::kUniqueOp);
-      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
-      astream.wait();
-
-      dx->set_layout(DataLayout::kMKLDNN);
-      dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
-    }
-
-    if (dy) {
-      // Direct copy
-      if (dout->dims() == dy->dims()) {
-        auto reorder_dst_memory_p =
-            handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
-        auto reorder_p =
-            handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-        platform::RecordEvent record_reorder(
-            "int_reorder", platform::TracerEventType::UserDefined, 2,
-            platform::EventRole::kUniqueOp);
-        reorder_p->execute(astream, *reorder_src_memory_p,
-                           *reorder_dst_memory_p);
-        astream.wait();
-
-        dy->set_layout(DataLayout::kMKLDNN);
-        dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
-      } else {
-        // Broadcasting
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-        reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
-                                       {DNNL_ARG_DST, *dy_memory_p}});
-        astream.wait();
-
-        dy->set_layout(DataLayout::kMKLDNN);
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(
@@ -116,6 +24,8 @@ REGISTER_OP_KERNEL(
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_add>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_add>)
 
-REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseAddMKLDNNGradKernel<paddle::platform::bfloat16>,
-                   ops::EltwiseAddMKLDNNGradKernel<float>)
+REGISTER_OP_KERNEL(
+    elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
+                                 dnnl::algorithm::binary_add>,
+    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_add>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
index 367d602f5902e..c68aa8d3d1b46 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
@@ -1,146 +1,28 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
-
-namespace paddle {
-namespace framework {
-class ExecutionContext;
-}  // namespace framework
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-template <typename T>
-class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* out = ctx.Input<framework::Tensor>("Out");
-    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-
-    if (dx) {
-      // dx = dout / y
-
-      platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(),
-          dout, y, dx, 1.0f, 1.0f, 1.0f);
-
-      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
-      const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
-      const auto dst_dx_memory = handler.AcquireDstMemory(dx);
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_y_memory},
-          {DNNL_ARG_DST, *dst_dx_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dx->set_layout(framework::DataLayout::kMKLDNN);
-      dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
-    }
-
-    if (dy) {
-      // dy = -dout * out / y
-
-      platform::BinaryMKLDNNHandler<T> y_handler(
-          dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y,
-          y, nullptr, 1.0f, 1.0f, 1.0f);
-
-      const auto y_memory = y_handler.AcquireSrcMemory(y);
-
-      dnnl::post_ops po;
-      po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc());
-
-      platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
-          dout, out, nullptr, -1.0f, 1.0f, 1.0f, po);
-
-      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
-      const auto src_out_memory = handler.AcquireSecondSrcMemory(out);
-
-      // If broadcasting is in use then let's write to temporary
-      // buffer allocated by oneDNN
-      const auto dst_dy_memory = (dout->dims() == dy->dims())
-                                     ? handler.AcquireDstMemory(dy)
-                                     : handler.AcquireDstMemory();
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_out_memory},
-          {DNNL_ARG_DST, *dst_dy_memory},
-          {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dy->set_layout(framework::DataLayout::kMKLDNN);
-
-      // Reduction is needed for broadcasting scenario
-      if (dout->dims() != dy->dims()) {
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-
-        // As source we use mem object with results from binary operation
-        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
-                                       {DNNL_ARG_DST, *dy_memory_p}});
-        astream.wait();
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-
-      } else {
-        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// TODO(piotrekobi) add int8, uint8 support
-REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
-                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
-                                            dnnl::algorithm::binary_div>)
-
-REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
-                   ops::EltwiseDivMKLDNNGradKernel<paddle::platform::bfloat16>,
-                   ops::EltwiseDivMKLDNNGradKernel<float>)
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
+                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
+                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
+                                            dnnl::algorithm::binary_div>)
+
+REGISTER_OP_KERNEL(
+    elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
+                                 dnnl::algorithm::binary_div>,
+    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_div>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
index ad8fd31701390..d1a1aa3008c8b 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h
@@ -15,20 +15,35 @@
 #pragma once
 #include <string>
 #include <unordered_map>
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 
 #include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
 
-using framework::DataLayout;
-using framework::Tensor;
 using dnnl::memory;
 using dnnl::primitive;
 using dnnl::stream;
+using framework::DataLayout;
+using framework::Tensor;
+
+inline std::vector<int64_t> CalculateBroadcastedDims(const Tensor* x,
+                                                     const Tensor* y) {
+  const auto src_tz = phi::vectorize(x->dims());
+  const auto dst_tz = phi::vectorize(y->dims());
+
+  size_t j = 0;
+  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
+  for (size_t i = 0; i < src_tz.size(); ++i) {
+    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
+    if (j == dst_tz.size()) break;
+  }
+
+  return dst_tz_ex;
+}
 
 template <typename T, dnnl::algorithm BINARY_OP>
 class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
@@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
     // operation.
     const bool reuse_x_memopry =
         x->numel() == z->numel() && x->IsSharedBufferWith(*z);
-    std::shared_ptr<dnnl::memory> dst_memory = nullptr;
+    std::shared_ptr<dnnl::memory> dst_memory;
     if (reuse_x_memopry) {
       dst_memory = src_x_memory;
       // NOTE(chenfeiyu): when the output reuses memory from other tensor rather
@@ -135,19 +150,187 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
   }
 };
 
-inline std::vector<int64_t> CalculateBroadcastedDims(const Tensor* x,
-                                                     const Tensor* y) {
-  const auto src_tz = phi::vectorize(x->dims());
-  const auto dst_tz = phi::vectorize(y->dims());
+template <typename T, dnnl::algorithm BINARY_OP>
+class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+    using Tensor = framework::Tensor;
 
-  size_t j = 0;
-  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
-  for (size_t i = 0; i < src_tz.size(); ++i) {
-    dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
-    if (j == dst_tz.size()) break;
-  }
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
 
-  return dst_tz_ex;
-}
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    int axis = ctx.Attr<int>("axis");
+
+    auto tz = phi::vectorize<int64_t>(dout->dims());
+    auto proto_type_dout = framework::TransToProtoVarType(dout->dtype());
+
+    platform::ReorderMKLDNNHandler reorder_handler(
+        tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout),
+        onednn_engine);
+
+    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+        dout->format(), platform::to_void_cast(dout->data<T>()));
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    if (dx) {
+      std::shared_ptr<dnnl::memory> dst_memory;
+
+      // elementwise_add & elementwise_sub
+      if (BINARY_OP == dnnl::algorithm::binary_add ||
+          BINARY_OP == dnnl::algorithm::binary_sub) {
+        dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(),
+                                                      ctx.GetPlace());
+        auto reorder_p =
+            reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p);
+        platform::RecordEvent record_reorder(
+            "int_reorder", platform::TracerEventType::UserDefined, 2,
+            platform::EventRole::kUniqueOp);
+
+        reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory);
+      } else {  // elementwise_mul & elementwise_div
+        platform::BinaryMKLDNNHandler<T> binary_handler(
+            BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f,
+            1.0f, 1.0f);
+
+        const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout);
+        const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y);
+        dst_memory = binary_handler.AcquireDstMemory(dx);
+
+        const auto binary_prim = binary_handler.AcquireForwardPrimitive();
+
+        const std::unordered_map<int, dnnl::memory> args = {
+            {DNNL_ARG_SRC_0, *src_dout_memory},
+            {DNNL_ARG_SRC_1, *src_y_memory},
+            {DNNL_ARG_DST, *dst_memory}};
+
+        binary_prim->execute(astream, args);
+      }
+      astream.wait();
+
+      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(*dst_memory));
+    }
+
+    if (dy) {
+      dnnl::primitive_attr broadcast_reduction_attr;
+      std::shared_ptr<dnnl::memory> broadcast_src_memory;
+      std::shared_ptr<dnnl::memory> dst_memory;
+
+      // elementwise_add & elementwise_sub
+      if (BINARY_OP == dnnl::algorithm::binary_add ||
+          BINARY_OP == dnnl::algorithm::binary_sub) {
+        if (dout->dims() == dy->dims()) {
+          auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
+              dy, dout->format(), ctx.GetPlace());
+
+          dnnl::primitive_attr reorder_attr;
+          std::vector<float> scales(1);
+          scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1;
+          reorder_attr.set_output_scales(0, scales);
+          auto reorder_p = std::make_shared<dnnl::reorder>(
+              *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr);
+          platform::RecordEvent record_reorder(
+              "int_reorder", platform::TracerEventType::UserDefined, 2,
+              platform::EventRole::kUniqueOp);
+          reorder_p->execute(astream, *reorder_src_memory_p,
+                             *reorder_dst_memory_p);
+
+          dst_memory = reorder_dst_memory_p;
+        } else {
+          broadcast_src_memory = reorder_src_memory_p;
+        }
+      } else {  // elementwise_mul & elementwise_div
+        std::unordered_map<int, dnnl::memory> args;
+        std::shared_ptr<dnnl::binary> binary_prim;
+        std::shared_ptr<dnnl::memory> post_op_memory;
+        std::shared_ptr<dnnl::memory> src_0_memory;
+        std::shared_ptr<dnnl::memory> src_1_memory;
+
+        platform::BinaryMKLDNNHandler<T> binary_handler(
+            dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(),
+            dout, x, nullptr, 1.0f, 1.0f, 1.0f);
+
+        src_1_memory = binary_handler.AcquireSecondSrcMemory(x);
+
+        if (BINARY_OP == dnnl::algorithm::binary_div) {
+          platform::BinaryMKLDNNHandler<T> post_op_binary_handler(
+              dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(),
+              y, y, nullptr, 1.0f, 1.0f, 1.0f);
+
+          post_op_memory = post_op_binary_handler.AcquireSrcMemory(y);
+
+          dnnl::post_ops po;
+          po.append_binary(dnnl::algorithm::binary_div,
+                           post_op_memory->get_desc());
+
+          binary_handler = platform::BinaryMKLDNNHandler<T>(
+              dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(),
+              dout, out, nullptr, -1.0f, 1.0f, 1.0f, po);
+
+          src_1_memory = binary_handler.AcquireSecondSrcMemory(out);
+        }
+
+        src_0_memory = binary_handler.AcquireSrcMemory(dout);
+
+        const auto dst_dy_memory = (dout->dims() == dy->dims())
+                                       ? binary_handler.AcquireDstMemory(dy)
+                                       : binary_handler.AcquireDstMemory();
+
+        binary_prim = binary_handler.AcquireForwardPrimitive();
+        args = {{DNNL_ARG_SRC_0, *src_0_memory},
+                {DNNL_ARG_SRC_1, *src_1_memory},
+                {DNNL_ARG_DST, *dst_dy_memory}};
+
+        if (BINARY_OP == dnnl::algorithm::binary_div)
+          args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
+                       *post_op_memory});
+
+        binary_prim->execute(astream, args);
+        broadcast_src_memory = dst_dy_memory;
+        dst_memory = dst_dy_memory;
+      }
+      astream.wait();
+      dy->set_layout(DataLayout::kMKLDNN);
+
+      if (dout->dims() != dy->dims()) {
+        // Broadcasting
+        if (BINARY_OP == dnnl::algorithm::binary_sub) {
+          dnnl::post_ops po;
+          po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0);
+          broadcast_reduction_attr.set_post_ops(po);
+        }
+
+        platform::ReductionMKLDNNHandler<T> reduction_handler(
+            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
+            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy),
+            broadcast_reduction_attr);
+        dst_memory = reduction_handler.AcquireDstMemory(dy);
+
+        auto reduction_p = reduction_handler.AcquireForwardPrimitive();
+
+        reduction_p->execute(astream, {
+                                          {DNNL_ARG_SRC, *broadcast_src_memory},
+                                          {DNNL_ARG_DST, *dst_memory},
+                                      });
+        astream.wait();
+        dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape(
+            phi::vectorize<int64_t>(dy->dims()))));
+      } else {
+        dy->set_format(platform::GetMKLDNNFormat(*dst_memory));
+      }
+    }
+  }
+};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index c03794012ff3b..0ef5c5e628ce6 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -1,127 +1,19 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
 
-namespace paddle {
-namespace framework {
-class ExecutionContext;
-}  // namespace framework
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-template <typename T>
-class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-
-    if (dx) {
-      // dx = dout*y
-      platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
-          dout, y, dx, 1.0f, 1.0f, 1.0f);
-
-      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
-      const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
-      const auto dst_dx_memory = handler.AcquireDstMemory(dx);
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_y_memory},
-          {DNNL_ARG_DST, *dst_dx_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dx->set_layout(framework::DataLayout::kMKLDNN);
-      dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
-    }
-
-    if (dy) {
-      // dy = dout*x
-      // Handler is having nullptr passed instead of output tensor as
-      // we want Dst buffer to be allocated by oneDNN not to use Tensor
-      platform::BinaryMKLDNNHandler<T> handler(
-          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
-          dout, x, nullptr, 1.0f, 1.0f, 1.0f);
-
-      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
-      const auto src_x_memory = handler.AcquireSecondSrcMemory(x);
-
-      // If broadcasting is in use then let's write to temporary
-      // buffer allocated by oneDNN
-      const auto dst_dy_memory = (dout->dims() == dy->dims())
-                                     ? handler.AcquireDstMemory(dy)
-                                     : handler.AcquireDstMemory();
-
-      const auto binary_prim = handler.AcquireForwardPrimitive();
-
-      const std::unordered_map<int, dnnl::memory> args = {
-          {DNNL_ARG_SRC_0, *src_dout_memory},
-          {DNNL_ARG_SRC_1, *src_x_memory},
-          {DNNL_ARG_DST, *dst_dy_memory}};
-
-      binary_prim->execute(astream, args);
-      astream.wait();
-
-      dy->set_layout(framework::DataLayout::kMKLDNN);
-
-      // Reduction is needed for broadcasting scenario
-      if (dout->dims() != dy->dims()) {
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-        // As source we use mem object with results from binary operation
-        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
-                                       {DNNL_ARG_DST, *dy_memory_p}});
-        astream.wait();
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-
-      } else {
-        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(
@@ -132,6 +24,8 @@ REGISTER_OP_KERNEL(
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_mul>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_mul>)
 
-REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseMulMKLDNNGradKernel<paddle::platform::bfloat16>,
-                   ops::EltwiseMulMKLDNNGradKernel<float>)
+REGISTER_OP_KERNEL(
+    elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
+                                 dnnl::algorithm::binary_mul>,
+    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_mul>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
index 3c799008a2abc..510373831eb6d 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
@@ -1,5 +1,4 @@
-
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,113 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
-namespace paddle {
-namespace framework {
-class ExecutionContext;
-}  // namespace framework
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-template <typename T>
-class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElemwiseGradKernel<T>::Compute(ctx);
-    using Tensor = framework::Tensor;
-
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& onednn_engine = dev_ctx.GetEngine();
-
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    auto tz = phi::vectorize<int64_t>(dout->dims());
-    memory::data_type dout_type = framework::ToMKLDNNDataType(
-        framework::TransToProtoVarType(dout->dtype()));
-    platform::ReorderMKLDNNHandler handler(
-        tz, framework::TransToProtoVarType(dout->dtype()), dout_type,
-        onednn_engine);
-
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-    auto reorder_src_memory_p = handler.AcquireSrcMemory(
-        dout->format(), platform::to_void_cast(dout->data<T>()));
-
-    if (dx) {
-      auto reorder_dst_memory_p =
-          handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
-      auto reorder_p =
-          handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-      platform::RecordEvent record_reorder(
-          "int_reorder", platform::TracerEventType::UserDefined, 2,
-          platform::EventRole::kUniqueOp);
-
-      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
-      astream.wait();
-
-      dx->set_layout(DataLayout::kMKLDNN);
-      dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
-    }
-
-    if (dy) {
-      // Direct copy
-      if (dout->dims() == dy->dims()) {
-        auto reorder_dst_memory_p =
-            handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
-
-        dnnl::primitive_attr reorder_attr;
-        std::vector<float> scales = {-1};
-        reorder_attr.set_output_scales(0, scales);
-        auto reorder_p = std::make_shared<dnnl::reorder>(
-            *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr);
-        platform::RecordEvent record_reorder(
-            "int_reorder", platform::TracerEventType::UserDefined, 2,
-            platform::EventRole::kUniqueOp);
-        reorder_p->execute(astream, *reorder_src_memory_p,
-                           *reorder_dst_memory_p);
-        astream.wait();
-
-        dy->set_layout(DataLayout::kMKLDNN);
-        dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
-      } else {
-        // Broadcasting
-
-        dnnl::post_ops po;
-        po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0);
-        dnnl::primitive_attr attr;
-        attr.set_post_ops(po);
-
-        platform::ReductionMKLDNNHandler<T> handler_sum(
-            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
-            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr);
-
-        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
-        auto reduction_p = handler_sum.AcquireForwardPrimitive();
-
-        reduction_p->execute(astream, {
-                                          {DNNL_ARG_SRC, *reorder_src_memory_p},
-                                          {DNNL_ARG_DST, *dy_memory_p},
-                                      });
-        astream.wait();
-
-        dy->set_layout(DataLayout::kMKLDNN);
-        dy->set_format(
-            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
-                phi::vectorize<int64_t>(dy->dims()))));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
 
 namespace ops = paddle::operators;
 
@@ -131,6 +24,8 @@ REGISTER_OP_KERNEL(
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_sub>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_sub>)
 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseSubMKLDNNGradKernel<paddle::platform::bfloat16>,
-                   ops::EltwiseSubMKLDNNGradKernel<float>)
+REGISTER_OP_KERNEL(
+    elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
+                                 dnnl::algorithm::binary_sub>,
+    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_sub>)
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
index 97a35a34f23e9..9361edd43bf15 100755
--- a/paddle/fluid/operators/expand_as_v2_op.cc
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -12,7 +12,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/expand_as_v2_op.h"
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,27 +24,6 @@ using framework::Tensor;
 class ExpandAsV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2");
-    auto x_dims = ctx->GetInputDim("X");
-    auto target_shape = ctx->Attrs().Get<std::vector<int>>("target_shape");
-    PADDLE_ENFORCE_GE(
-        target_shape.size(), static_cast<size_t>(x_dims.size()),
-        platform::errors::InvalidArgument(
-            "The rank of target_shape must be greater than or equal "
-            "to the rank of Input(X). But received Input(X): input "
-            "rank %u; received target_shape: rank %u.",
-            x_dims.size(), target_shape.size()));
-    PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED,
-                      platform::errors::InvalidArgument(
-                          "The rank of target_shape must be less than or equal "
-                          "to %d. But received: rank %u.",
-                          MAX_RANK_SUPPORTED, target_shape.size()));
-    ctx->SetOutputDim("Out", phi::make_ddim(target_shape));
-  }
 };
 
 class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker {
@@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor,
+                            PD_INFER_META(phi::ExpandAsInferMeta));
 REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker,
                   ops::ExpandAsV2GradOpMaker<paddle::framework::OpDesc>,
-                  ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>);
+                  ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>,
+                  ExpandAsInferShapeFunctor);
 REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp,
                   ops::ExpandAsV2GradNoNeedBufVarsInferer);
 
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index c88a8fe196edf..c0ec44909a5f3 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -58,19 +58,15 @@ __global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
 }
 
 template <typename T>
-__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale,
-                                             T max_range, const int num,
-                                             const int cin, const int cout,
-                                             T* out) {
-  int bid = blockIdx.x;
-  T s = scale[bid % cout];
-
-  int wh_size = num / (cin * cout);
-  const T* in_current = in + bid * wh_size;
-  T* out_current = out + bid * wh_size;
-
-  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
-    out_current[i] = in_current[i] * s / max_range;
+__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
+                                             const T max_range,
+                                             const int64_t num,
+                                             const int n_scales,
+                                             const int quant_stride, T* out) {
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
+    T s = scale[(i / quant_stride) % n_scales];
+    out[i] = in[i] * s / max_range;
   }
 }
 
@@ -98,20 +94,32 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
     const T* in_data = in->data<T>();
     T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
     if (scale_num == 1) {
-      int num = in->numel();
+      int64_t num = in->numel();
       const T* scale_factor = scales[0]->data<T>();
       if (quant_axis == 0) {
         int grid = in_dims[0];
         int block = 1024;
         DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
             in_data, scale_factor, max_range, num, in_dims[0], out_data);
-      } else if (quant_axis == 1) {
-        // Dequantize weight of Cin * Cout * W * H
-        int grid = in_dims[0] * in_dims[1];
-        int block = 1024;
-        DequantizeOneScaleQuantAxis1<T><<<grid, block, 0, dev_ctx.stream()>>>(
-            in_data, scale_factor, max_range, num, in_dims[0], in_dims[1],
-            out_data);
+      } else {
+        int quant_stride = 1;
+        for (int i = quant_axis + 1; i < in_dims.size(); i++) {
+          quant_stride *= in_dims[i];
+        }
+
+        int64_t block_size = std::min(
+            num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
+        int64_t max_threads =
+            dev_ctx.GetMaxPhysicalThreadCount();  // SM * block_per_SM
+        const int64_t max_blocks = std::max(
+            ((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
+        const int64_t grid_size =
+            std::min(max_blocks, (num + block_size - 1) / block_size);
+
+        DequantizeOneScaleQuantAxisN<
+            T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+            in_data, scale_factor, max_range, num, in_dims[quant_axis],
+            quant_stride, out_data);
       }
     } else if (scale_num == 2) {
       // Not need to consider quant_axis
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 9f7e4fb8d5749..01384a6cafef9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
   extern __shared__ char* shared_max_data_tmp[];
   auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
   if (gridDim.x > 1) {
-    shared_max_data[tid] = T(0);
+    T local_max_data = T(0);
     for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
       T tmp = abs(in[i]);
-      if (tmp > shared_max_data[tid]) {
-        shared_max_data[tid] = tmp;
+      if (tmp > local_max_data) {
+        local_max_data = tmp;
       }
     }
+    shared_max_data[tid] = local_max_data;
   } else {
     if (bid < n) {
       shared_max_data[tid] = abs(in[bid]);
@@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
   int channel_size = n / c;
   const T* in_c = in + blockIdx.x * channel_size;
   extern __shared__ T shared_max_data[];
-  shared_max_data[tid] = T(0);
+  T local_max_data = T(0);
   for (int i = tid; i < channel_size; i += blockDim.x) {
     T tmp = fabs(in_c[i]);
-    if (tmp > shared_max_data[tid]) {
-      shared_max_data[tid] = tmp;
+    if (tmp > local_max_data) {
+      local_max_data = tmp;
     }
   }
+  shared_max_data[tid] = local_max_data;
   __syncthreads();
   for (int i = blockDim.x / 2; i > 0; i >>= 1) {
     if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
@@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   const T* in_current = in + tid * cout_wh_size + bid * wh_size;
-  shared_max_data[tid] = T(0);
+  T local_max_data = T(0);
   for (int i = 0; i < wh_size; i++) {
     T tmp = fabs(in_current[i]);
-    if (tmp > shared_max_data[tid]) {
-      shared_max_data[tid] = tmp;
+    if (tmp > local_max_data) {
+      local_max_data = tmp;
     }
   }
+  shared_max_data[tid] = local_max_data;
   __syncthreads();
 
   int len = blockDim.x;
@@ -270,18 +273,18 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
 template <typename T>
 __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
                                                     const int bin_cnt,
-                                                    const int n, const int c,
-                                                    T* out) {
+                                                    const int64_t n,
+                                                    const int c, T* out) {
   int tid = threadIdx.x;
 
-  int channel_size = n / c;
+  int64_t channel_size = n / c;
   const T* in_c = in + blockIdx.x * channel_size;
   T* out_c = out + blockIdx.x * channel_size;
 
   T s = scale[blockIdx.x];
   T inv_s = inverse(s);
 
-  for (int i = tid; i < channel_size; i += blockDim.x) {
+  for (int64_t i = tid; i < channel_size; i += blockDim.x) {
     T x = in_c[i];
     T v = x > s ? s : x;
     v = v < -s ? -s : v;
@@ -290,25 +293,20 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
   }
 }
 
-// ChannelClipAndQuantKernel for quant_axis is 1
+// ChannelClipAndQuantKernel for quant_axis is N
 template <typename T>
-__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale,
-                                                    const int bin_cnt,
-                                                    const int n, const int cin,
-                                                    const int cout, T* out) {
-  T s = scale[blockIdx.x % cout];
-  T inv_s = inverse(s);
-
-  int wh_size = n / (cin * cout);
-  const T* in_c = in + blockIdx.x * wh_size;
-  T* out_c = out + blockIdx.x * wh_size;
-
-  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
-    T x = in_c[i];
+__global__ void ChannelClipAndQuantKernelQuantAxisN(
+    const T* in, const T* scale, const int bin_cnt, const int64_t n,
+    const int nScale, const int quant_stride, T* out) {
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
+    T s = scale[(i / quant_stride) % nScale];
+    T inv_s = 1.0 / s;
+    T x = in[i];
     T v = x > s ? s : x;
     v = v < -s ? -s : v;
     v = bin_cnt * inv_s * v;
-    out_c[i] = round(v);
+    out[i] = round(v);
   }
 }
 
@@ -324,7 +322,7 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
                                           "the received is %d",
                                           quant_axis));
 
-    int num = in.numel();
+    int64_t num = in.numel();
     auto in_dims = in.dims();
     const T* in_data = in.data<T>();
     const T* scale_data = scale.data<T>();
@@ -335,11 +333,24 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
       int block = 1024;
       ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
           in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
-    } else if (quant_axis == 1) {
-      int grid = in_dims[0] * in_dims[1];
-      int block = 1024;
-      ChannelClipAndQuantKernelQuantAxis1<T><<<grid, block, 0, ctx.stream()>>>(
-          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+    } else {
+      int quant_stride = 1;
+      for (int i = quant_axis + 1; i < in_dims.size(); i++) {
+        quant_stride *= in_dims[i];
+      }
+      int64_t block_size =
+          std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
+      int64_t max_threads =
+          ctx.GetMaxPhysicalThreadCount();  // SM * block_per_SM
+      const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
+                                          static_cast<int64_t>(1));
+
+      const int64_t grid_size =
+          std::min(max_blocks, (num + block_size - 1) / block_size);
+
+      ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
+          out_data);
     }
   }
 };
@@ -404,6 +415,19 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
+template <typename T>
+__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
+                                              const T* in_accum,
+                                              const T* cur_scale, const T rate,
+                                              T* out_state, T* out_accum,
+                                              T* out_scale) {
+  T state = rate * (*in_state) + T(1.0f);
+  T accum = rate * (*in_accum) + (*cur_scale);
+  *out_state = state;
+  *out_accum = accum;
+  *out_scale = accum / state;
+}
+
 template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
 template <typename T>
@@ -415,29 +439,14 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
                   framework::Tensor* out_accum, framework::Tensor* out_scale) {
     const auto gpu_place = ctx.GetPlace();
 
-    T accum;
-    T state;
-    T scale;
-    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
-                 sizeof(T), ctx.stream());
-    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
-                 sizeof(T), ctx.stream());
-    memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
-                 ctx.stream());
-    ctx.Wait();
-
     T rate_t = static_cast<T>(rate);
-    state = rate_t * state + static_cast<T>(1.0);
-    accum = rate_t * accum + scale;
-    scale = accum / state;
-
-    memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &state, sizeof(T), ctx.stream());
-    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
-    ctx.Wait();
+    T* out_state_data = out_state->mutable_data<T>(gpu_place);
+    T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
+    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
+
+    FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
+        in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
+        out_state_data, out_accum_data, out_scale_data);
   }
 };
 
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index f699dac7976c5..57e7cbb74079e 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
 #include "paddle/fluid/operators/batch_size_like.h"
 
 namespace paddle {
@@ -23,9 +22,13 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
   using BatchSizeLikeOp::BatchSizeLikeOp;
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
+    framework::OpKernelType kernel_type = framework::OpKernelType(
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
         ctx.device_context());
+    if (ctx.Attr<bool>("force_cpu")) {
+      kernel_type.place_ = platform::CPUPlace();
+    }
+    return kernel_type;
   }
 };
 
@@ -64,15 +67,3 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::FillConstantBatchSizeLikeOpMaker,
     ops::BatchSizeLikeNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int64_t>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           bool>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
deleted file mode 100644
index de06aeb01e4dd..0000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           paddle::platform::float16>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int64_t>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           bool>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
deleted file mode 100644
index 31471c6b62268..0000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    auto float_value = ctx.Attr<float>("value");
-    auto str_value = ctx.Attr<std::string>("str_value");
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("Input");
-    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
-      // set the correct batch size for the LoDTensor.
-      auto odims = out->dims();
-      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
-      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
-      out->mutable_data<T>(odims, ctx.GetPlace());
-    }
-
-    T value;
-    if (str_value.empty()) {
-      value = static_cast<T>(float_value);
-    } else {
-      std::stringstream convert_stream(str_value);
-      if (std::is_same<int64_t, T>::value) {
-        int64_t tmp_value;
-        convert_stream >> tmp_value;
-        value = static_cast<T>(tmp_value);
-      } else {
-        double tmp_value;
-        convert_stream >> tmp_value;
-        value = static_cast<T>(tmp_value);
-      }
-    }
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
-    if (cpu_place) {
-      auto &dev_ctx = *pool.Get(platform::CPUPlace());
-      phi::funcs::SetConstant<platform::CPUDeviceContext, T> functor;
-      out->mutable_data(platform::CPUPlace(),
-                        framework::TransToPhiDataType(data_type));
-      functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
-              out, static_cast<T>(value));
-    }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (!cpu_place) {
-      auto &dev_ctx = *pool.Get(ctx.GetPlace());
-      phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-      out->mutable_data(ctx.GetPlace(),
-                        framework::TransToPhiDataType(data_type));
-      functor(reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx),
-              out, static_cast<T>(value));
-    }
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc
index 9aa71e094484d..2ff9beb36f284 100644
--- a/paddle/fluid/operators/frame_op.cc
+++ b/paddle/fluid/operators/frame_op.cc
@@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel {
       end_axis = x_rank - 2;
     }
 
-    PADDLE_ENFORCE_LE(frame_length, seq_length,
-                      platform::errors::InvalidArgument(
-                          "Attribute(frame_length) of FrameOp should be less "
-                          "equal than sequence length, but got (%s) > (%s).",
-                          frame_length, seq_length));
+    bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
+    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+    if (check) {
+      PADDLE_ENFORCE_LE(frame_length, seq_length,
+                        platform::errors::InvalidArgument(
+                            "Attribute(frame_length) of FrameOp should be less "
+                            "equal than sequence length, but got (%s) > (%s).",
+                            frame_length, seq_length));
+    }
 
     // It won't go into for loop when x_rank == 1U.
     for (int i = start_axis; i <= end_axis; i++) {
       output_shape.push_back(x_dims[i]);
     }
 
-    n_frames = 1 + (seq_length - frame_length) / hop_length;
+    if (seq_length == -1) {
+      n_frames = -1;
+    } else {
+      n_frames = 1 + (seq_length - frame_length) / hop_length;
+    }
 
     if (axis == 0) {
       // (n_frames, frame_length, ...)
diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc
index c445a28c084f6..e60fc44e9a6ff 100644
--- a/paddle/fluid/operators/fused/conv_fusion_op.cc
+++ b/paddle/fluid/operators/fused/conv_fusion_op.cc
@@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp {
       ctx->SetOutputsDim("Outputs", output_shapes);
     }
   }
+
+  std::vector<int64_t> ComputeOutputShape(
+      framework::InferShapeContext* ctx) const {
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv");
+    OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::string padding_algorithm =
+        ctx->Attrs().Get<std::string>("padding_algorithm");
+    int groups = ctx->Attrs().Get<int>("groups");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+    int dilation_size = dilations.size();
+    for (int i = 0; i < dilation_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          dilations[i], 0,
+          platform::errors::InvalidArgument(
+              "The dilation of Op(Conv) should be larget than 0, but received "
+              "dilation is %d.",
+              dilations[i]));
+    }
+    const std::string data_format =
+        ctx->Attrs().Get<std::string>("data_format");
+
+    // MKL-DNN Kernels are using NCHW order of dims description
+    // so we ignore data_format consideration for MKL-DNN kernel
+    const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) &&
+                              (data_format == "NHWC" || data_format == "NDHWC");
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size() == 4 || in_dims.size() == 5, true,
+        platform::errors::InvalidArgument(
+            "The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
+            "received: input's dimension is %u, input's shape is [%s].",
+            in_dims.size(), in_dims));
+
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), filter_dims.size(),
+        platform::errors::InvalidArgument(
+            "The input's dimension and filter's dimension of "
+            "Op(Conv) should be equal. But received: the input's shape is "
+            "[%s], "
+            "the input's dimension is %d; the filter's shape is [%s],  "
+            "the filter's dimension is %d.",
+            in_dims, in_dims.size(), filter_dims, filter_dims.size()));
+
+    int stride_size = strides.size();
+    for (int i = 0; i < stride_size; ++i) {
+      PADDLE_ENFORCE_GT(
+          strides[i], 0,
+          platform::errors::InvalidArgument(
+              "The stride of Op(Conv) should be larget than 0, but received "
+              "stride is %d.",
+              strides[i]));
+    }
+
+    int in_sub_stride_size = in_dims.size() - stride_size;
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), strides.size() + 2U,
+        platform::errors::InvalidArgument(
+            "The difference of input's dimension and Attr(strides)'s "
+            "length must be euqal to 2 for Op(Conv). "
+            "But received: input's dimension is %d, input's shape is [%s]; "
+            "Attr(stride)'s length is %d, Attr(stride) is [%s]; "
+            "difference of input's dimention and Attr(strides)'s length = %u.",
+            in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides),
+            in_sub_stride_size));
+
+    const auto input_channels =
+        channel_last ? in_dims[in_dims.size() - 1] : in_dims[1];
+
+    PADDLE_ENFORCE_EQ(
+        input_channels, filter_dims[1] * groups,
+        platform::errors::InvalidArgument(
+            "The number of input's channels should be equal to filter's "
+            "channels "
+            "* groups for Op(Conv). But received: the input's channels is %d, "
+            "the input's shape is [%s]; the filter's channels is %d, the "
+            "filter's shape is [%s]; the groups is %d, the data_format is %s. "
+            "The error may come from wrong data_format setting.",
+            input_channels, in_dims, filter_dims[1], filter_dims, groups,
+            data_format));
+    PADDLE_ENFORCE_EQ(
+        filter_dims[0] % groups, 0,
+        platform::errors::InvalidArgument(
+            "The number of output's channels (filter's first dimension) of "
+            "Op(Conv) should be divided by groups. But received: "
+            "the output channels is %d, the filter's shape is [%s], "
+            "the groups is %d.",
+            filter_dims[0], filter_dims, groups));
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GT(
+          filter_dims[0], 0,
+          platform::errors::InvalidArgument(
+              "the size of filter at axis 0 should be greater than 0"));
+    }
+
+    framework::DDim in_data_dims;
+    if (channel_last) {
+      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+    } else {
+      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+    }
+
+    framework::DDim filter_data_dims =
+        phi::slice_ddim(filter_dims, 2, filter_dims.size());
+
+    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             in_data_dims, strides, ksize);
+
+    std::vector<int64_t> output_shape({in_dims[0]});
+    if (!channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+    for (int i = 0; i < in_data_dims.size(); ++i) {
+      if ((!ctx->IsRuntime()) &&
+          (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
+        output_shape.push_back(-1);
+      } else {
+        output_shape.push_back(
+            ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i],
+                           paddings[2 * i], paddings[2 * i + 1], strides[i]));
+      }
+    }
+    if (channel_last) {
+      output_shape.push_back(filter_dims[0]);
+    }
+
+    return output_shape;
+  }
 };
 
 // TODO(qingqing): add gradient operator for conv2d_fusion
diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h
index 18c7187fc8e64..a9b72a9cdf397 100644
--- a/paddle/fluid/operators/fused/fused_dropout_test.h
+++ b/paddle/fluid/operators/fused/fused_dropout_test.h
@@ -25,14 +25,16 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/layer_norm_kernel.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace memory = paddle::memory;
 
 USE_OP_ITSELF(dropout);
-USE_OP(layer_norm);
+USE_OP_ITSELF(layer_norm);
 
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
@@ -136,18 +138,23 @@ void LayerNorm(const std::vector<LayerNormParamType<T>> &scale,
                const platform::CUDADeviceContext &ctx) {
   framework::Scope scope;
   auto place = ctx.GetPlace();
+  paddle::optional<const framework::LoDTensor &> scale_opt = paddle::none;
   if (scale.size() > 0) {
     auto var_scale = scope.Var("Scale");
     auto tensor_scale = var_scale->GetMutable<framework::LoDTensor>();
     framework::TensorFromVector(scale, ctx, tensor_scale);
     tensor_scale->Resize({cols});
+    scale_opt = *tensor_scale;
   }
 
+  paddle::optional<const framework::LoDTensor &> bias_opt = paddle::none;
   if (bias.size() > 0) {
     auto var_bias = scope.Var("Bias");
     auto tensor_bias = var_bias->GetMutable<framework::LoDTensor>();
     framework::TensorFromVector(bias, ctx, tensor_bias);
     tensor_bias->Resize({cols});
+
+    bias_opt = *tensor_bias;
   }
 
   auto var_x = scope.Var("X");
@@ -157,20 +164,19 @@ void LayerNorm(const std::vector<LayerNormParamType<T>> &scale,
 
   auto var_y = scope.Var("Y");
   auto tensor_y = var_y->GetMutable<framework::LoDTensor>();
+  tensor_y->Resize({rows, cols});
 
   auto var_mean = scope.Var("Mean");
   auto tensor_mean = var_mean->GetMutable<framework::LoDTensor>();
+  tensor_mean->Resize({rows});
 
   auto var_variance = scope.Var("Variance");
   auto tensor_variance = var_variance->GetMutable<framework::LoDTensor>();
-
-  framework::AttributeMap attrs;
-  attrs.insert({"epsilon", epsilon});
-
-  auto op = framework::OpRegistry::CreateOp(
-      "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}},
-      {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs);
-  op->Run(scope, place);
+  tensor_variance->Resize({rows});
+  ctx.Wait();
+  phi::LayerNormKernel<T>(static_cast<const phi::GPUContext &>(ctx), *tensor_x,
+                          scale_opt, bias_opt, 1e-5, 1, false, tensor_y,
+                          tensor_mean, tensor_variance);
   framework::TensorToVector(*tensor_y, ctx, y);
   framework::TensorToVector(*tensor_mean, ctx, means);
   framework::TensorToVector(*tensor_variance, ctx, vars);
diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
index 032440d7f0478..c7e1f4a5463fe 100644
--- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu
@@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias {
             residual_vec[i * cols + j] + out2[i * cols + j];
       }
     }
-
     LayerNorm<T>(scale_vec, layernorm_bias_vec, correct_out, &correct_means,
                  &correct_vars, &correct_layernorm_out, epsilon, rows, cols,
                  *ctx);
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 8a405cc6fc1ba..9f2b48a24b447 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -12,12 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GatherOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) of GatherOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of GatherOp should not be null."));
-
-    auto index_dims = ctx->GetInputDim("Index");
-
-    if (index_dims.size() == 2) {
-      PADDLE_ENFORCE_EQ(
-          index_dims[1], 1,
-          platform::errors::InvalidArgument(
-              "The last dim of index should be 1 when it is 2D, but we get %d",
-              index_dims[1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          index_dims.size(), 1,
-          platform::errors::InvalidArgument(
-              "The index should be 1D, when it is not 2D, but we get %d",
-              index_dims.size()));
-    }
-
-    auto axis = ctx->Attrs().Get<int>("axis");
-    auto input_dim = ctx->GetInputDim("X");
-    if (ctx->HasInput("Axis") || axis == 0) {
-      // if HasInput("Axis"), we can not obtain correct shape of output
-      int batch_size = index_dims[0];
-      framework::DDim output_dims(input_dim);
-      output_dims[0] = batch_size;
-      ctx->SetOutputDim("Out", output_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    } else {
-      int index_size = index_dims[0];
-      std::vector<int> out_dim_vec;
-      for (int i = 0; i < axis; i++) {
-        out_dim_vec.push_back(input_dim[i]);
-      }
-      out_dim_vec.push_back(index_size);
-      for (int i = axis + 1; i < input_dim.size(); i++) {
-        out_dim_vec.push_back(input_dim[i]);
-      }
-      auto output_dims = phi::make_ddim(out_dim_vec);
-      ctx->SetOutputDim("Out", output_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -193,22 +141,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor,
+                            PD_INFER_META(phi::GatherInferMeta));
 REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                   ops::GatherGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GatherGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GatherGradOpMaker<paddle::imperative::OpBase>,
+                  GatherInferShapeFunctor);
+DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralUnaryGradInferMeta));
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
-                  ops::GatherGradNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
-                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
-                       ops::GatherOpKernel<uint8_t>,
-                       ops::GatherOpKernel<int64_t>,
-                       ops::GatherOpKernel<phi::dtype::bfloat16>);
-REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
-                       ops::GatherGradientOpKernel<double>,
-                       ops::GatherGradientOpKernel<int>,
-                       ops::GatherGradientOpKernel<uint8_t>,
-                       ops::GatherGradientOpKernel<int64_t>,
-                       ops::GatherGradientOpKernel<phi::dtype::bfloat16>);
+                  ops::GatherGradNoNeedBufferVarInferer,
+                  GatherGradInferShapeFunctor);
+
 REGISTER_OP_VERSION(gather)
     .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC",
                    paddle::framework::compatible::OpVersionDesc().NewInput(
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
deleted file mode 100644
index e0db2f26d3e05..0000000000000
--- a/paddle/fluid/operators/gather_op.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-#include "paddle/phi/kernels/funcs/scatter.cu.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class GatherOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    int axis = ctx.Attr<int>("axis");
-
-    // get axis from tensor
-    if (ctx.HasInput("Axis")) {
-      Tensor cpu_axis;
-      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis);
-      const auto &axis_type =
-          framework::TransToProtoVarType(axis_tensor->dtype());
-      if (axis_type == framework::proto::VarType::INT32) {
-        axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
-      } else if (axis_type == framework::proto::VarType::INT64) {
-        axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
-      } else if (axis_type == framework::proto::VarType::INT16) {
-        axis = static_cast<int>(cpu_axis.data<int16_t>()[0]);
-      }
-    }
-    const auto &place = ctx.GetPlace();
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    const auto &dev_ctx = ctx.cuda_device_context();
-    if (axis != 0) {
-      if (index_type == framework::proto::VarType::INT32) {
-        phi::funcs::GatherV2CUDAFunction<T, int32_t>(x, index, axis, output,
-                                                     dev_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
-                                                     dev_ctx);
-      } else if (index_type == framework::proto::VarType::INT16) {
-        phi::funcs::GatherV2CUDAFunction<T, int16_t>(x, index, axis, output,
-                                                     dev_ctx);
-      }
-      return;
-    }
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    if (index_type == framework::proto::VarType::INT32) {
-      phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT16) {
-      phi::funcs::GPUGather<T, int16_t>(dev_ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      platform::errors::PreconditionNotMet(
-                          "This kernel only runs on GPU device."));
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    int axis = ctx.Attr<int>("axis");
-    if (ctx.HasInput("Axis")) {
-      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      Tensor cpu_axis;
-      framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis);
-      const auto &axis_type =
-          framework::TransToProtoVarType(axis_tensor->dtype());
-      if (axis_type == framework::proto::VarType::INT32) {
-        axis = static_cast<int>(cpu_axis.data<int32_t>()[0]);
-      } else if (axis_type == framework::proto::VarType::INT64) {
-        axis = static_cast<int>(cpu_axis.data<int64_t>()[0]);
-      }
-    }
-
-    const auto &dev_ctx = ctx.cuda_device_context();
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    if (axis != 0) {
-      if (index_type == framework::proto::VarType::INT32) {
-        phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(dO, index, axis, dX,
-                                                         dev_ctx);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        phi::funcs::GatherV2GradCUDAFunction<T, int64_t>(dO, index, axis, dX,
-                                                         dev_ctx);
-      }
-      return;
-    }
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
-                       .eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-    if (index_type == framework::proto::VarType::INT32) {
-      phi::funcs::GPUScatterAssign<T, int>(dev_ctx, *dO, *index, dX,
-                                           ctx.Attr<bool>("overwrite"));
-    } else if (index_type == framework::proto::VarType::INT64) {
-      phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX,
-                                               ctx.Attr<bool>("overwrite"));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
-                        ops::GatherOpCUDAKernel<double>,
-                        ops::GatherOpCUDAKernel<int64_t>,
-                        ops::GatherOpCUDAKernel<int>,
-                        ops::GatherOpCUDAKernel<int16_t>,
-                        ops::GatherOpCUDAKernel<plat::float16>,
-                        ops::GatherOpCUDAKernel<plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
-                        ops::GatherGradOpCUDAKernel<double>,
-                        ops::GatherGradOpCUDAKernel<int64_t>,
-                        ops::GatherGradOpCUDAKernel<int>,
-                        ops::GatherGradOpCUDAKernel<plat::float16>,
-                        ops::GatherGradOpCUDAKernel<plat::bfloat16>);
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
deleted file mode 100644
index 94de694b2f9bc..0000000000000
--- a/paddle/fluid/operators/gather_op.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/gather.h"
-#include "paddle/phi/kernels/funcs/scatter.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class GatherOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *output = ctx.Output<Tensor>("Out");
-
-    int axis = ctx.Attr<int>("axis");
-    // get axis from tensor
-    if (ctx.HasInput("Axis")) {
-      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      const auto &axis_type = axis_tensor->dtype();
-      if (axis_type == phi::DataType::INT32) {
-        axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-      } else if (axis_type == phi::DataType::INT64) {
-        axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
-      }
-    }
-    const auto &index_type = index->dtype();
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-    if (axis != 0) {
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::GatherV2Function<T, int32_t>(dev_ctx, x, index, axis,
-                                                 output);
-      } else if (index_type == phi::DataType::INT64) {
-        phi::funcs::GatherV2Function<T, int64_t>(dev_ctx, x, index, axis,
-                                                 output);
-      }
-      return;
-    }
-
-    output->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == 0) return;
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::CPUGather<T, int>(dev_ctx, *x, *index, output);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::CPUGather<T, int64_t>(dev_ctx, *x, *index, output);
-    }
-  }
-};
-
-template <typename T>
-class GatherGradientOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(ctx.GetPlace()), true,
-        platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
-
-    auto *index = ctx.Input<Tensor>("Index");
-    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-
-    int axis = ctx.Attr<int>("axis");
-    if (ctx.HasInput("Axis")) {
-      const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      const auto &axis_type = axis_tensor->dtype();
-      if (axis_type == phi::DataType::INT32) {
-        axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-      } else if (axis_type == phi::DataType::INT64) {
-        axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
-      }
-    }
-    const auto &index_type = index->dtype();
-    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
-
-    if (axis != 0) {
-      if (index_type == phi::DataType::INT32) {
-        phi::funcs::GatherV2GradFunction<T, int32_t>(dev_ctx, dO, index, axis,
-                                                     dX);
-      } else if (index_type == phi::DataType::INT64) {
-        phi::funcs::GatherV2GradFunction<T, int64_t>(dev_ctx, dO, index, axis,
-                                                     dX);
-      }
-      return;
-    }
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *dev_ctx.eigen_device();
-    dxt.device(place) = dxt.constant(static_cast<T>(0));
-    if (dO->numel() == 0) return;
-    bool overwrite = ctx.Attr<bool>("overwrite");
-
-    if (index_type == phi::DataType::INT32) {
-      if (overwrite) {
-        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *dO, *index, dX);
-      } else {
-        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
-      }
-    } else if (index_type == phi::DataType::INT64) {
-      if (overwrite) {
-        phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX);
-      } else {
-        phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index 21093f585b59e..f996b1ede2f0f 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
index 3dce380360815..b42050eabe300 100644
--- a/paddle/fluid/operators/gather_op_npu_test.cc
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -24,16 +24,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/gather_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(gather);
+USE_OP_ITSELF(gather);
 USE_OP_DEVICE_KERNEL(gather, NPU);
-USE_OP(gather_grad);
+USE_OP_ITSELF(gather_grad);
 USE_OP_DEVICE_KERNEL(gather_grad, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc
index 28f2f7d473bef..6c691aa14ae77 100644
--- a/paddle/fluid/operators/gather_op_xpu.cc
+++ b/paddle/fluid/operators/gather_op_xpu.cc
@@ -13,15 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/gather_op.h"
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/phi/core/ddim.h"
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename T>
 class GatherOpXPUKernel : public framework::OpKernel<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc
index 3d338f00d4fcb..3be2606bfc939 100644
--- a/paddle/fluid/operators/gelu_op.cc
+++ b/paddle/fluid/operators/gelu_op.cc
@@ -14,10 +14,11 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
-#include <unordered_map>
-
-#include "paddle/fluid/operators/gelu_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel {
          const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(%s) of GeluOp should not be null.", "X"));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(%s) of GeluOp should not be null.", "Out"));
-
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
 REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker,
                   ops::GeluGradOpMaker<paddle::framework::OpDesc>,
-                  ops::GeluGradOpMaker<paddle::imperative::OpBase>);
+                  ops::GeluGradOpMaker<paddle::imperative::OpBase>,
+                  GeluInferShapeFunctor);
 REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp);
-REGISTER_OP_CPU_KERNEL(
-    gelu, ops::GeluKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GeluKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    gelu_grad, ops::GeluGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GeluGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu
deleted file mode 100644
index ef836ab72f001..0000000000000
--- a/paddle/fluid/operators/gelu_op.cu
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/gelu_op.h"
-
-DECLARE_bool(use_fast_math);
-
-namespace paddle {
-namespace operators {
-
-#ifdef __NVCC__
-template <bool FastMode>
-static __device__ __forceinline__ float FP32FastTanh(float x) {
-#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000
-  if (FastMode) {
-    float y;
-    asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x));
-    return y;
-  }
-#endif
-  return tanhf(x);
-}
-
-template <bool FastMode>
-static __device__ __forceinline__ float FP32GeluFwd(float x) {
-  auto tanh_out =
-      FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
-  return x * 0.5f * (1.0f + tanh_out);
-}
-
-template <bool FastMode>
-static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) {
-  auto tanh_out =
-      FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
-  auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) *
-                         (0.79788456f + 0.1070322243f * x * x)) +
-             0.5f * (1.0f + tanh_out);
-  return tmp * y_g;
-}
-
-template <int VecSize, bool FastMode>
-static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y,
-                                                 size_t n) {
-  size_t offset =
-      static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
-  size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
-  for (; offset < n; offset += stride) {
-    using ArrT = phi::AlignedVector<__half, VecSize>;
-    ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
-#pragma unroll
-    for (int i = 0; i < VecSize; ++i) {
-      float tmp = __half2float(in_arr[i]);
-      in_arr[i] = __float2half(FP32GeluFwd<FastMode>(tmp));
-    }
-    *reinterpret_cast<ArrT*>(y + offset) = in_arr;
-  }
-}
-
-template <int VecSize, bool FastMode>
-static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
-                                                 const __half* y_g, __half* x_g,
-                                                 size_t n) {
-  size_t offset =
-      static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
-  size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
-  for (; offset < n; offset += stride) {
-    using ArrT = phi::AlignedVector<__half, VecSize>;
-    ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
-    ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
-#pragma unroll
-    for (int i = 0; i < VecSize; ++i) {
-      __half2 tmp_fp16_2;
-      tmp_fp16_2.x = x_in_arr[i];
-      tmp_fp16_2.y = y_g_in_arr[i];
-      float2 tmp_fp32_2 = __half22float2(tmp_fp16_2);
-      x_in_arr[i] =
-          __float2half(FP32GeluBwd<FastMode>(tmp_fp32_2.x, tmp_fp32_2.y));
-    }
-    *reinterpret_cast<ArrT*>(x_g + offset) = x_in_arr;
-  }
-}
-
-static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
-    const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y,
-    size_t n) {
-  auto is_aligned = [](const void* p, size_t alignment) {
-    return reinterpret_cast<uintptr_t>(p) % alignment == 0;
-  };
-
-#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math)      \
-  do {                                                                        \
-    constexpr auto kAlignment =                                               \
-        alignof(phi::AlignedVector<__half, __vec_size>);                      \
-    if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
-        is_aligned(y, kAlignment)) {                                          \
-      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
-      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
-      VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
-               << " , thread = " << thread;                                   \
-      FP16FastGeluFwdCUDAKernel<                                              \
-          __vec_size,                                                         \
-          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y, n);  \
-      return true;                                                            \
-    }                                                                         \
-  } while (0)
-
-  if (FLAGS_use_fast_math) {
-    PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true);
-  } else {
-    PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false);
-  }
-
-#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL
-  return false;
-}
-
-static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
-    const platform::CUDADeviceContext& dev_ctx, const __half* x,
-    const __half* y_g, __half* x_g, size_t n) {
-  auto is_aligned = [](const void* p, size_t alignment) {
-    return reinterpret_cast<uintptr_t>(p) % alignment == 0;
-  };
-
-#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math)      \
-  do {                                                                        \
-    constexpr auto kAlignment =                                               \
-        alignof(phi::AlignedVector<__half, __vec_size>);                      \
-    if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
-        is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) &&           \
-        is_aligned(x_g, kAlignment)) {                                        \
-      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
-      size_t block = (n / __vec_size + thread - 1) / thread;                  \
-      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
-      VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
-               << " , thread = " << thread;                                   \
-      FP16FastGeluBwdCUDAKernel<                                              \
-          __vec_size,                                                         \
-          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y_g,    \
-                                                                   x_g, n);   \
-      return true;                                                            \
-    }                                                                         \
-  } while (0)
-
-  if (FLAGS_use_fast_math) {
-    PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true);
-  } else {
-    PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false);
-  }
-
-#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL
-  return false;
-}
-#endif
-
-template <typename T>
-struct GeluWithApproximateFunctor {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T arg_x) {
-    // this function is tanh approximation of gelu
-    MPType x = static_cast<MPType>(arg_x);
-    MPType one = static_cast<MPType>(1);
-    MPType half = static_cast<MPType>(0.5);
-    MPType kAlpha = static_cast<MPType>(M_2_SQRTPI * M_SQRT1_2);
-    auto tanh_out =
-        tanh(kAlpha * x * (one + static_cast<MPType>(GELU_CONSTANT) * x * x));
-    MPType out = x * half * (one + tanh_out);
-    return static_cast<T>(out);
-  }
-};
-
-template <typename T>
-struct GeluWithoutApproximateFunctor {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T arg_x) {
-    // actual gelu with approximation = false
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(x * normcdf(x));
-  }
-};
-
-template <typename T>
-class GeluKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto approximate = context.Attr<bool>("approximate");
-    out->mutable_data<T>(in->place());
-
-    std::vector<const framework::Tensor*> ins = {in};
-    std::vector<framework::Tensor*> outs = {out};
-    const auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-
-    if (approximate) {
-#ifdef __NVCC__
-      if (std::is_same<T, platform::float16>::value) {
-        size_t n = in->numel();
-        const auto* in_ptr = reinterpret_cast<const __half*>(in->data<T>());
-        auto* out_ptr = reinterpret_cast<__half*>(out->data<T>());
-        if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr,
-                                                        out_ptr, n)) {
-          return;
-        }
-      }
-#endif
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                     T, T>(
-          dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
-    } else {
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                     T, T>(
-          dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor<T>());
-    }
-  }
-};
-
-template <typename T>
-struct GeluWithApproximateGradFunctor {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType one = static_cast<MPType>(1);
-    MPType half = static_cast<MPType>(0.5);
-    MPType kAlpha = static_cast<MPType>(M_2_SQRTPI * M_SQRT1_2);
-    MPType kBeta =
-        kAlpha * static_cast<MPType>(GELU_CONSTANT) * static_cast<MPType>(3);
-    auto cube_x = x * x * x;
-    auto tanh_out =
-        tanh(kAlpha * ((static_cast<MPType>(GELU_CONSTANT) * cube_x) + x));
-    auto ans =
-        half * (one + tanh_out +
-                (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x));
-    return static_cast<T>(ans * dout);
-  }
-};
-
-template <typename T>
-struct GeluWithoutApproximateGradFunctor {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType dout = static_cast<MPType>(arg_dout);
-    constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast<MPType>(0.5);
-    const MPType cdf = normcdf(x);
-    const MPType pdf = exp(static_cast<MPType>(-0.5) * x * x) * kBeta;
-    return static_cast<T>(dout * (cdf + x * pdf));
-  }
-};
-
-template <typename T>
-class GeluGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto approximate = context.Attr<bool>("approximate");
-    dx->mutable_data<T>(dout->place());
-
-    std::vector<const framework::Tensor*> ins = {x, dout};
-    std::vector<framework::Tensor*> outs = {dx};
-    const auto& dev_ctx =
-        context.template device_context<platform::CUDADeviceContext>();
-    if (approximate) {
-#ifdef __NVCC__
-      if (std::is_same<T, platform::float16>::value) {
-        size_t n = x->numel();
-        const auto* x_ptr = reinterpret_cast<const __half*>(x->data<T>());
-        const auto* y_g_ptr = reinterpret_cast<const __half*>(dout->data<T>());
-        auto* x_g_ptr = reinterpret_cast<__half*>(dx->data<T>());
-        if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr,
-                                                        x_g_ptr, n)) {
-          return;
-        }
-      }
-#endif
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                     T, T>(
-          dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor<T>());
-    } else {
-      paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary,
-                                                     T, T>(
-          dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor<T>());
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    gelu, ops::GeluKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GeluKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::GeluKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    gelu_grad, ops::GeluGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GeluGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::GeluGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::float16>);
diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h
deleted file mode 100644
index d4fed8a868ff9..0000000000000
--- a/paddle/fluid/operators/gelu_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#ifndef _USE_MATH_DEFINES
-#define _USE_MATH_DEFINES
-#endif
-#include <algorithm>
-#include <cmath>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define GELU_CONSTANT 0.044715
-
-template <typename T>
-struct GeluFunctor {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out, bool approximate) const {
-    if (approximate) {
-      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp =
-            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
-             (casted_x + static_cast<float>(GELU_CONSTANT) * casted_x.cube()))
-                .tanh();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
-                     (x + static_cast<T>(GELU_CONSTANT) * x.cube()))
-                        .tanh();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto out_data = out.data();
-      int n = std::min(x.size(), out.size());
-
-      std::memset(out_data, 0, n * sizeof(T));
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
-                                 out_data, 1);
-      phi::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
-      for (int i = 0; i < n; i++) {
-        out_data[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
-      for (int i = 0; i < n; i++) {
-        out_data[i] *= static_cast<T>(0.5);
-      }
-#else
-      // gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
-        out.device(d) = (casted_x * static_cast<float>(0.5) *
-                         (static_cast<float>(1) + temp))
-                            .template cast<T>();
-      } else {
-        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
-      }
-#endif
-    }
-  }
-};
-
-template <typename T>
-struct GeluGradFunctor {
-  template <typename Device, typename X, typename dOut, typename dX>
-  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
-    if (approximate) {
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-
-        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
-        const float kBeta =
-            kAlpha * static_cast<float>(GELU_CONSTANT) * static_cast<float>(3);
-        const auto y =
-            (kAlpha *
-             ((static_cast<float>(GELU_CONSTANT) * casted_x.cube()) + casted_x))
-                .tanh();
-        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
-                        (static_cast<float>(1) + y +
-                         (casted_x - casted_x * y.square()) *
-                             (kAlpha + kBeta * casted_x.square())))
-                           .template cast<T>();
-      } else {
-        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
-        const T kBeta =
-            kAlpha * static_cast<T>(GELU_CONSTANT) * static_cast<T>(3);
-        const auto y =
-            (kAlpha * ((static_cast<T>(GELU_CONSTANT) * x.cube()) + x)).tanh();
-        dx.device(d) = static_cast<T>(0.5) * dout *
-                       (static_cast<T>(1) + y +
-                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
-      }
-    } else {
-#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
-    !defined(PADDLE_WITH_HIP)
-      auto x_data = x.data();
-      auto dx_data = dx.data();
-      auto dout_data = dout.data();
-      int n = std::min(x.size(), dx.size());
-
-      auto first = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(first, 0, n * sizeof(T));
-      auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
-      std::memset(second, 0, n * sizeof(T));
-
-      // first = (0.5 * (1 + erf(x / sqrt(2))))
-      phi::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first,
-                                 1);
-      phi::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
-      for (int i = 0; i < n; i++) {
-        first[i] += static_cast<T>(1);
-      }
-      phi::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
-
-      // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
-      phi::funcs::CBlas<T>::VSQUARE(n, x_data, second);
-      phi::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
-      phi::funcs::CBlas<T>::VEXP(n, second, second);
-      phi::funcs::CBlas<T>::VMUL(n, x_data, second, second);
-      phi::funcs::CBlas<T>::SCAL(
-          n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
-
-      // dx = dout * (first + second);
-      phi::funcs::CBlas<T>::VADD(n, first, second, first);
-      phi::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
-
-      std::free(first);
-      std::free(second);
-#else
-      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
-      // exp(- x^2 / 2)
-      if (std::is_same<T, platform::float16>::value) {
-        VLOG(4) << "cast from float16 to float before computing";
-        auto casted_x = x.template cast<float>();
-        auto casted_dout = dout.template cast<float>();
-        auto first = static_cast<float>(0.5) *
-                     (static_cast<float>(1) +
-                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
-        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
-                      casted_x *
-                      (-static_cast<float>(0.5) * casted_x.square()).exp();
-        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
-      } else {
-        auto first =
-            static_cast<T>(0.5) *
-            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-
-        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                      (-static_cast<T>(0.5) * x.square()).exp();
-        dx.device(d) = dout * (first + second);
-      }
-#endif
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    auto approximate = context.Attr<bool>("approximate");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluFunctor<T> functor;
-    functor(place, eigen_in, eigen_out, approximate);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GeluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* dout =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto approximate = context.Attr<bool>("approximate");
-    dx->mutable_data<T>(dout->place());
-
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_dout = framework::EigenVector<T>::Flatten(*dout);
-    auto eigen_dx = framework::EigenVector<T>::Flatten(*dx);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    GeluGradFunctor<T> functor;
-    functor(place, eigen_x, eigen_dout, eigen_dx, approximate);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc
index 18bbc7f4929c6..c5297dd9cd404 100644
--- a/paddle/fluid/operators/gelu_op_npu.cc
+++ b/paddle/fluid/operators/gelu_op_npu.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/gelu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc
index f3ac53138328d..b132b3170756d 100644
--- a/paddle/fluid/operators/gelu_op_npu_test.cc
+++ b/paddle/fluid/operators/gelu_op_npu_test.cc
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(gelu);
+USE_OP_ITSELF(gelu);
 USE_OP_DEVICE_KERNEL(gelu, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
index b8c2e9becf295..559d2448ad945 100644
--- a/paddle/fluid/operators/gelu_op_xpu.cc
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include <memory>
 #include <string>
-
-#include "paddle/fluid/operators/gelu_op.h"
-
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc
index f7c006dbcb1a9..f67dea7402864 100644
--- a/paddle/fluid/operators/graph_send_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_recv_op.cc
@@ -38,7 +38,7 @@ class GraphSendRecvGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto in_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
   }
 
@@ -68,6 +68,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
                          "tensors of Dst_index.")
         .SetDefault("SUM")
         .InEnum({"SUM", "MEAN", "MIN", "MAX"});
+    AddAttr<int64_t>(
+        "out_size",
+        "(int64_t, default 0)"
+        "Define the first dimension of Output tensor."
+        "If set default 0, then the shape of Out is the same with X.")
+        .SetDefault(0);
     AddComment(R"DOC(
 Graph Learning Send_Recv combine operator.
 
@@ -93,6 +99,7 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("graph_send_recv_grad");
     op->SetInput("Src_index", this->Input("Src_index"));
     op->SetInput("Dst_index", this->Input("Dst_index"));
+    op->SetInput("X", this->Input("X"));
 
     if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
       op->SetInput("Dst_count", this->Output("Dst_count"));
@@ -100,7 +107,6 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
 
     if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
         BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
-      op->SetInput("X", this->Input("X"));
       op->SetInput("Out", this->Output("Out"));
     }
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 04aa6a3e10f6e..f6d3fd8984691 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -12,12 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/grid_sampler_op.h"
 #include <memory>
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,43 +31,6 @@ using Tensor = framework::Tensor;
 class GridSampleOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler");
-    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of GridSampleOp should be 4-D Tensor, but "
-                          "received X dimension size(%d)",
-                          x_dims.size()));
-    PADDLE_ENFORCE_EQ(grid_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "Input(Grid) of GridSampleOp should be 4-D Tensor, "
-                          "but received X dimension size(%d)",
-                          grid_dims.size()));
-    if (ctx->IsRuntime() || grid_dims[3] > 0) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[3], 2,
-          platform::errors::InvalidArgument(
-              "Input(Grid) dimension[3] should be 2, but received %d",
-              grid_dims[3]));
-    }
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          grid_dims[0], x_dims[0],
-          platform::errors::InvalidArgument(
-              "Input(X) and Input(Grid) dimension[0] should be equal, but "
-              "received X dimension[0](%d) != Grid dimension[0](%d)",
-              x_dims[0], grid_dims[0]));
-    }
-
-    ctx->SetOutputDim("Output",
-                      {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
-    ctx->ShareLoD("X", "Output");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
 class GridSampleOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
-                   framework::GradVarName("X"), "grid_sampler");
-    auto input_dims = ctx->GetInputDim("X");
-    auto grid_dims = ctx->GetInputDim("Grid");
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
-      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
-    }
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -224,19 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor,
+                            PD_INFER_META(phi::GridSampleBaseInferMeta));
 REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                   ops::GridSampleGradMaker<paddle::framework::OpDesc>,
-                  ops::GridSampleGradMaker<paddle::imperative::OpBase>);
-REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    grid_sampler_grad,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
+                  ops::GridSampleGradMaker<paddle::imperative::OpBase>,
+                  GridSamplerInferShapeFunctor);
+DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor,
+                            PD_INFER_META(phi::GeneralBinaryGradInferMeta));
+REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad,
+                  GridSamplerGradInferShapeFunctor);
 
 REGISTER_OP_VERSION(grid_sampler)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
deleted file mode 100644
index a227a8e312765..0000000000000
--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ /dev/null
@@ -1,492 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/grid_sampler_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) {
-  return h >= 0 && h < H && w >= 0 && w < W;
-}
-
-template <typename T>
-static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH,
-                                                  int sW, int H, int W,
-                                                  T delta) {
-  if (in_bounds(h, w, H, W)) {
-    platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T _unnormalize(T coord, int size,
-                                                 bool align_corners) {
-  if (align_corners) {
-    return ((coord + 1.f) / 2) * (size - 1);
-  } else {
-    return ((coord + 1.f) * size - 1) / 2;
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T clip_indexes(T in, int max_value) {
-  return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
-}
-
-template <typename T>
-static __forceinline__ __device__ T reflect_indexes(T in, int twice_low,
-                                                    int twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<T>(0);
-  }
-  T min = static_cast<T>(twice_low) / 2;
-  T span = static_cast<T>(twice_high - twice_low) / 2;
-  in = fabs(in - min);
-  T extra = fmod(in, span);
-  int flips = static_cast<int>(floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T compute_positions(T coord, int size,
-                                                      PaddingMode padding_mode,
-                                                      bool align_corners) {
-  coord = _unnormalize<T>(coord, size, align_corners);
-  if (padding_mode == PaddingMode::border) {
-    coord = clip_indexes(coord, size - 1);
-  } else if (padding_mode == PaddingMode::reflect) {
-    if (align_corners) {
-      coord = reflect_indexes(coord, 0, 2 * (size - 1));
-    } else {
-      coord = reflect_indexes(coord, -1, 2 * size - 1);
-    }
-    coord = clip_indexes(coord, size - 1);
-  }
-  return coord;
-}
-
-template <typename T>
-static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size,
-                                                           bool align_corners,
-                                                           T* grad_in) {
-  if (align_corners) {
-    *grad_in = static_cast<T>(size - 1) / 2;
-    return ((coord + 1.f) / 2) * (size - 1);
-  } else {
-    *grad_in = static_cast<T>(size) / 2;
-    return ((coord + 1.f) * size - 1) / 2;
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit,
-                                                           T* grad_in) {
-  if (in <= static_cast<T>(0)) {
-    *grad_in = static_cast<T>(0);
-    return static_cast<T>(0);
-  } else {
-    T max = static_cast<T>(clip_limit - 1);
-    if (in >= max) {
-      *grad_in = static_cast<T>(0);
-      return max;
-    } else {
-      *grad_in = static_cast<T>(1);
-      return in;
-    }
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T
-reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) {
-  if (twice_low == twice_high) {
-    *grad_in = static_cast<T>(0);
-    return static_cast<T>(0);
-  }
-  int grad_in_mult_;
-  T min = static_cast<T>(twice_low) / 2;
-  T span = static_cast<T>(twice_high - twice_low) / 2;
-  in = in - min;
-  if (in < static_cast<T>(0)) {
-    grad_in_mult_ = -1;
-    in = -in;
-  } else {
-    grad_in_mult_ = 1;
-  }
-  T extra = fmod(in, span);
-  int flips = static_cast<int>(floor(in / span));
-  if (flips % 2 == 0) {
-    *grad_in = static_cast<T>(grad_in_mult_);
-    return extra + min;
-  } else {
-    *grad_in = static_cast<T>(-grad_in_mult_);
-    return span - extra + min;
-  }
-}
-
-template <typename T>
-static __forceinline__ __device__ T
-compute_positions_with_mask(T coord, int size, PaddingMode padding_mode,
-                            bool align_corners, T* grad_in) {
-  T grad_clip, grad_refl;
-  coord = _unnormalize_with_mask<T>(coord, size, align_corners, grad_in);
-  if (padding_mode == PaddingMode::border) {
-    coord = clip_indexes_with_mask(coord, size, &grad_clip);
-    *grad_in = (*grad_in) * grad_clip;
-  } else if (padding_mode == PaddingMode::reflect) {
-    if (align_corners) {
-      coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl);
-    } else {
-      coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl);
-    }
-    coord = clip_indexes_with_mask(coord, size, &grad_clip);
-    *grad_in = (*grad_in) * grad_refl * grad_clip;
-  }
-
-  return coord;
-}
-
-template <typename T>
-__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
-                                        int out_h, int out_w, int in_h,
-                                        int in_w, const T* input, const T* grid,
-                                        T* output, const Mode mode,
-                                        const PaddingMode padding_mode,
-                                        bool align_corners) {
-  int inp_sN = out_c * in_h * in_w;
-
-  int inp_sC = in_h * in_w;
-  int inp_sH = in_w;
-  int inp_sW = 1;
-  int grid_sN = out_h * out_w * 2;
-  int grid_sH = out_w * 2;
-  int grid_sW = 2;
-  int grid_sCoor = 1;
-  int out_sN = out_c * out_h * out_w;
-  int out_sC = out_h * out_w;
-  int out_sH = out_w;
-  int out_sW = 1;
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int w = index % out_w;
-    const int h = (index / out_w) % out_h;
-    const int n = index / (out_h * out_w);
-    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
-
-    T ix = grid[grid_offset];
-    T iy = grid[grid_offset + grid_sCoor];
-
-    ix = compute_positions(ix, in_w, padding_mode, align_corners);
-    iy = compute_positions(iy, in_h, padding_mode, align_corners);
-    if (mode == Mode::bilinear) {
-      int ix_nw = static_cast<int>(floor(ix));
-      int iy_nw = static_cast<int>(floor(iy));
-      int ix_ne = ix_nw + 1;
-      int iy_ne = iy_nw;
-      int ix_sw = ix_nw;
-      int iy_sw = iy_nw + 1;
-      int ix_se = ix_nw + 1;
-      int iy_se = iy_nw + 1;
-
-      T nw = (ix_se - ix) * (iy_se - iy);
-      T ne = (ix - ix_sw) * (iy_sw - iy);
-      T sw = (ix_ne - ix) * (iy - iy_ne);
-      T se = (ix - ix_nw) * (iy - iy_nw);
-
-      auto inp_offset_NC = n * inp_sN;
-
-      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
-      for (int c = 0; c < out_c;
-           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
-        *out_ptr_NCHW = static_cast<T>(0);
-        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
-          *out_ptr_NCHW +=
-              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
-        }
-        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
-          *out_ptr_NCHW +=
-              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
-        }
-        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
-          *out_ptr_NCHW +=
-              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
-        }
-        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
-          *out_ptr_NCHW +=
-              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
-        }
-      }
-    } else if (mode == Mode::nearest) {
-      int ix_nearest = static_cast<int>(std::nearbyint(ix));
-      int iy_nearest = static_cast<int>(std::nearbyint(iy));
-      auto inp_offset_NC = n * inp_sN;
-      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
-      for (int c = 0; c < out_c;
-           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
-        if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) {
-          *out_ptr_NCHW =
-              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
-        } else {
-          *out_ptr_NCHW = static_cast<T>(0);
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto align_corners = ctx.Attr<bool>("align_corners");
-    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
-    auto mode_s = ctx.Attr<std::string>("mode");
-    PaddingMode padding_mode;
-    Mode mode;
-    if (padding_mode_s == "border") {
-      padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflection") {
-      padding_mode = PaddingMode::reflect;
-    } else {
-      padding_mode = PaddingMode::zeros;
-    }
-
-    if (mode_s == "nearest") {
-      mode = Mode::nearest;
-    } else {
-      mode = Mode::bilinear;
-    }
-
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    const int n = grid->dims()[0];
-    const int out_h = grid->dims()[1];
-    const int out_w = grid->dims()[2];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-    VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
-            << "; out_w: " << out_w;
-    auto* output = ctx.Output<Tensor>("Output");
-    auto* output_data = output->mutable_data<T>(ctx.GetPlace());
-    VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
-            << "; " << output->dims()[2] << "; " << output->dims()[3];
-    int count = static_cast<int>(n * out_h * out_w);
-    auto cu_stream = dev_ctx.stream();
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(dev_ctx, count);
-    grid_sample_cuda_kernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
-        count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
-        grid->data<T>(), output_data, mode, padding_mode, align_corners);
-  }
-};
-
-template <typename T>
-__global__ void grid_sampler_cuda_backward_kernel(
-    const int nthreads, const T* grad_output, const T* input, const T* grid,
-    int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input,
-    T* grad_grid, const Mode mode, const PaddingMode padding_mode,
-    bool align_corners) {
-  int inp_sN = out_c * in_h * in_w;
-  int inp_sC = in_h * in_w;
-  int inp_sH = in_w;
-  int inp_sW = 1;
-  int grid_sN = out_h * out_w * 2;
-  int grid_sH = out_w * 2;
-  int grid_sW = 2;
-  int grid_sCoor = 1;
-
-  int gOut_sN = out_c * out_h * out_w;
-  int gOut_sC = out_h * out_w;
-  int gOut_sH = out_w;
-  int gOut_sW = 1;
-
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    const int w = index % out_w;
-    const int h = (index / out_w) % out_h;
-    const int n = index / (out_h * out_w);
-    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
-
-    T ix = grid[grid_offset];
-    T iy = grid[grid_offset + grid_sCoor];
-
-    T gix_mult, giy_mult;
-    ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners,
-                                     &gix_mult);
-    iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners,
-                                     &giy_mult);
-
-    if (mode == Mode::bilinear) {
-      int ix_nw = static_cast<int>(floor(ix));
-      int iy_nw = static_cast<int>(floor(iy));
-      int ix_ne = ix_nw + 1;
-      int iy_ne = iy_nw;
-      int ix_sw = ix_nw;
-      int iy_sw = iy_nw + 1;
-      int ix_se = ix_nw + 1;
-      int iy_se = iy_nw + 1;
-
-      T nw = (ix_se - ix) * (iy_se - iy);
-      T ne = (ix - ix_sw) * (iy_sw - iy);
-      T sw = (ix_ne - ix) * (iy - iy_ne);
-      T se = (ix - ix_nw) * (iy - iy_nw);
-
-      T gix = static_cast<T>(0), giy = static_cast<T>(0);
-      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
-      T* gInp_ptr_NC = grad_input + n * inp_sN;
-      int inp_offset_NC = n * inp_sN;
-      for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC,
-               gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
-        T gOut = grad_output[gOut_offset];
-
-        atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w,
-                   nw * gOut);
-        atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w,
-                   ne * gOut);
-        atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w,
-                   sw * gOut);
-        atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w,
-                   se * gOut);
-
-        if (in_bounds(iy_nw, ix_nw, in_h, in_w)) {
-          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
-          gix -= nw_val * (iy_se - iy) * gOut;
-          giy -= nw_val * (ix_se - ix) * gOut;
-        }
-        if (in_bounds(iy_ne, ix_ne, in_h, in_w)) {
-          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
-          gix += ne_val * (iy_sw - iy) * gOut;
-          giy -= ne_val * (ix - ix_sw) * gOut;
-        }
-        if (in_bounds(iy_sw, ix_sw, in_h, in_w)) {
-          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
-          gix -= sw_val * (iy - iy_ne) * gOut;
-          giy += sw_val * (ix_ne - ix) * gOut;
-        }
-        if (in_bounds(iy_se, ix_se, in_h, in_w)) {
-          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
-          gix += se_val * (iy - iy_nw) * gOut;
-          giy += se_val * (ix - ix_nw) * gOut;
-        }
-      }
-
-      if (grad_grid != nullptr) {
-        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
-        gGrid_ptr_NHW[0] = gix_mult * gix;
-        gGrid_ptr_NHW[1] = giy_mult * giy;
-      }
-    } else if (mode == Mode::nearest) {
-      int ix_nearest = static_cast<int>(std::nearbyint(ix));
-      int iy_nearest = static_cast<int>(std::nearbyint(iy));
-
-      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
-      T* gInp_ptr_NC = grad_input + n * inp_sN;
-      for (int c = 0; c < out_c;
-           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
-        atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h,
-                   in_w, grad_output[gOut_offset]);
-      }
-
-      if (grad_grid != nullptr) {
-        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
-        gGrid_ptr_NHW[0] = static_cast<T>(0);
-        gGrid_ptr_NHW[1] = static_cast<T>(0);
-      }
-    }
-  }
-}
-
-template <typename T>
-class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto align_corners = ctx.Attr<bool>("align_corners");
-    auto padding_mode_s = ctx.Attr<std::string>("padding_mode");
-    auto mode_s = ctx.Attr<std::string>("mode");
-
-    PaddingMode padding_mode;
-    Mode mode;
-    if (padding_mode_s == "border") {
-      padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflection") {
-      padding_mode = PaddingMode::reflect;
-    } else {
-      padding_mode = PaddingMode::zeros;
-    }
-
-    if (mode_s == "nearest") {
-      mode = Mode::nearest;
-    } else {
-      mode = Mode::bilinear;
-    }
-
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-
-    const int n = grid->dims()[0];
-    const int out_h = grid->dims()[1];
-    const int out_w = grid->dims()[2];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>(ctx.GetPlace());
-    phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()(
-        ctx.template device_context<paddle::platform::CUDADeviceContext>(),
-        input_grad, static_cast<T>(0));
-
-    T* grid_grad_data = nullptr;
-    if (ctx.HasOutput(framework::GradVarName("Grid"))) {
-      auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-      grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    int count = static_cast<int>(n * out_h * out_w);
-    auto cu_stream = dev_ctx.stream();
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(dev_ctx, count);
-    grid_sampler_cuda_backward_kernel<
-        T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
-        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n, c,
-        out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data, mode,
-        padding_mode, align_corners);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel<float>,
-                        ops::GridSampleOpCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(grid_sampler_grad,
-                        ops::GridSampleGradOpCUDAKernel<float>,
-                        ops::GridSampleGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
deleted file mode 100644
index 93e96694270a4..0000000000000
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ /dev/null
@@ -1,600 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <iostream>
-#include <string>
-#include <utility>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/hostdevice.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-enum class Mode {
-  bilinear,
-  nearest,
-};
-
-enum class PaddingMode { zeros, border, reflect };
-
-using Tensor = framework::Tensor;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-
-using Array3 = Eigen::DSizes<int64_t, 3>;
-using Array4 = Eigen::DSizes<int64_t, 4>;
-
-template <typename T>
-static inline bool isInBound(T x, T y, T x_max, T y_max) {
-  if (x < 0 || x > x_max || y < 0 || y > y_max) {
-    return false;
-  }
-  return true;
-}
-
-template <typename T>
-static inline void unnormalize(const platform::CPUDeviceContext& ctx,
-                               Tensor* grid_slice,
-                               const int max_val,  // height-1 or width-1
-                               bool align_corners) {
-  auto& place = *ctx.eigen_device();
-  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
-
-  if (!align_corners) {
-    auto factor = static_cast<T>((max_val + 1) * 0.5);
-    grid_slice_t.device(place) =
-        (grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
-  } else {
-    auto factor = static_cast<T>(max_val * 0.5);
-    grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
-  }
-}
-
-template <typename T>
-static inline void clip(const platform::CPUDeviceContext& ctx,
-                        Tensor* grid_slice,
-                        const int max_val,  // height-1 or width-1
-                        bool align_corners, std::string padding_mode) {
-  auto& place = *ctx.eigen_device();
-  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
-  if (padding_mode == "border") {
-    grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
-                                     .cwiseMin(static_cast<T>(max_val));
-  } else if (padding_mode == "reflection") {
-    if (align_corners) {
-      auto double_range = static_cast<T>(max_val * 2);
-      auto grid_abs = grid_slice_t.abs();
-      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
-      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
-      if (max_val == 0) {
-        grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
-      }
-    } else {
-      auto double_range = static_cast<T>((max_val + 1) * 2);
-      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
-      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
-      grid_slice_t.device(place) =
-          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
-      grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
-                                       .cwiseMin(static_cast<T>(max_val));
-    }
-  }
-}
-
-template <typename T>
-static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
-                                const int max_val,  // height-1 or width-1
-                                bool align_corners, std::string padding_mode,
-                                Tensor* grid_slice, Tensor* grid_scale) {
-  auto& place = *ctx.eigen_device();
-  grid_scale->mutable_data<T>(grid_slice->dims(), ctx.GetPlace());
-
-  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
-  auto factor = static_cast<T>(max_val * 0.5);
-  if (!align_corners) {
-    factor = static_cast<T>((max_val + 1) * 0.5);
-  }
-  auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
-
-  if (padding_mode == "border") {
-    //    auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
-    auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
-                   .cwiseMin(static_cast<T>(max_val));
-
-    auto in_bound = (res == grid_slice_t);
-    grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
-    grid_slice_t.device(place) = res;
-  } else if (padding_mode == "reflection") {
-    if (align_corners) {
-      auto double_range = static_cast<T>(max_val * 2);
-      auto is_neg = (grid_slice_t < static_cast<T>(0));
-      auto grid_abs = grid_slice_t.abs();
-      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
-      auto one_more_flip = (extra > (double_range - extra));
-      grid_scale_t.device(place) =
-          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
-                          (is_neg != one_more_flip).template cast<T>());
-      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
-      if (max_val == 0) {
-        grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
-      }
-    } else {
-      auto double_range = static_cast<T>((max_val + 1) * 2);
-      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
-      auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
-      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
-      auto one_more_flip = (extra > (double_range - extra));
-      auto reflected =
-          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
-      auto clipped = reflected.cwiseMax(static_cast<T>(0))
-                         .cwiseMin(static_cast<T>(max_val));
-      auto in_bound = (clipped == reflected).template cast<T>();
-      grid_scale_t.device(place) =
-          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
-                          (is_neg != one_more_flip).template cast<T>()) *
-          in_bound;
-      grid_slice_t.device(place) = clipped;
-    }
-  }
-}
-
-template <typename T>
-static void calcGridLocations(const platform::CPUDeviceContext& ctx,
-                              const Tensor& grid, const int in_h,
-                              const int in_w, bool align_corners,
-                              std::string padding_mode, Tensor* grid_x,
-                              Tensor* grid_y) {
-  const int n = grid.dims()[0];
-  const int out_h = grid.dims()[1];
-  const int out_w = grid.dims()[2];
-
-  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
-  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  const T* grid_data = grid.data<T>();
-  for (int i = 0; i < n * out_h * out_w; i++) {
-    grid_x_data[i] = grid_data[2 * i];
-    grid_y_data[i] = grid_data[(2 * i) + 1];
-  }
-
-  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
-  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
-
-  clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
-  clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
-}
-
-template <typename T>
-static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx,
-                                      const Tensor& grid, const int in_h,
-                                      const int in_w, bool align_corners,
-                                      std::string padding_mode, Tensor* grid_x,
-                                      Tensor* grid_y, Tensor* grid_x_scale,
-                                      Tensor* grid_y_scale) {
-  const int n = grid.dims()[0];
-  const int out_h = grid.dims()[1];
-  const int out_w = grid.dims()[2];
-
-  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
-  T* grid_x_data = grid_x->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  T* grid_y_data = grid_y->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-
-  const T* grid_data = grid.data<T>();
-  for (int i = 0; i < n * out_h * out_w; i++) {
-    grid_x_data[i] = grid_data[2 * i];
-    grid_y_data[i] = grid_data[(2 * i) + 1];
-  }
-
-  unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
-  unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
-
-  clipWithMask<T>(ctx, in_w - 1, align_corners, padding_mode, grid_x,
-                  grid_x_scale);
-  clipWithMask<T>(ctx, in_h - 1, align_corners, padding_mode, grid_y,
-                  grid_y_scale);
-}
-
-template <typename T>
-static void getGridPointValue(const Tensor& input, Tensor* output,
-                              const Tensor& x, const Tensor& y) {
-  const int n = input.dims()[0];
-  const int c = input.dims()[1];
-  const int in_h = input.dims()[2];
-  const int in_w = input.dims()[3];
-  const int out_h = x.dims()[1];
-  const int out_w = x.dims()[2];
-  auto x_t = EigenTensor<T, 3>::From(x);
-  auto y_t = EigenTensor<T, 3>::From(y);
-  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
-  auto input_t = EigenTensor<T, 4>::From(input);
-
-  for (int i = 0; i < n; i++) {
-    for (int k = 0; k < out_h; k++) {
-      for (int l = 0; l < out_w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
-                      (T)(in_h - 1))) {
-          for (int j = 0; j < c; j++) {
-            output_t(i, j, k, l) =
-                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                        static_cast<int>(round(x_t(i, k, l))));
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void allNeigbors(const platform::CPUDeviceContext& ctx,
-                        const Tensor& input, Tensor* grid_x, Tensor* grid_y,
-                        Tensor* x_w, Tensor* x_e, Tensor* y_n,
-                        Tensor* y_s,  // positions
-                        Tensor* d_w, Tensor* d_e, Tensor* d_n,
-                        Tensor* d_s,  // distance
-                        Tensor* v_wn, Tensor* v_en, Tensor* v_ws,
-                        Tensor* v_es) {  // values
-  auto& place = *ctx.eigen_device();
-
-  const int c = input.dims()[1];
-  const int n = grid_x->dims()[0];
-  const int out_h = grid_x->dims()[1];
-  const int out_w = grid_x->dims()[2];
-  // calculate coords of 4 corner points
-  x_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  x_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  y_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  y_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  auto x_w_t = EigenTensor<T, 3>::From(*x_w);
-  auto x_e_t = EigenTensor<T, 3>::From(*x_e);
-  auto y_n_t = EigenTensor<T, 3>::From(*y_n);
-  auto y_s_t = EigenTensor<T, 3>::From(*y_s);
-
-  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
-  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
-
-  x_w_t.device(place) = grid_x_t.floor();
-  x_e_t.device(place) = x_w_t + static_cast<T>(1);
-  y_n_t.device(place) = grid_y_t.floor();
-  y_s_t.device(place) = y_n_t + static_cast<T>(1);
-
-  // calculate distances to 4 sides
-  d_w->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  d_e->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  d_n->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  d_s->mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-  auto d_w_t = EigenTensor<T, 3>::From(*d_w);
-  auto d_e_t = EigenTensor<T, 3>::From(*d_e);
-  auto d_n_t = EigenTensor<T, 3>::From(*d_n);
-  auto d_s_t = EigenTensor<T, 3>::From(*d_s);
-  d_w_t.device(place) = grid_x_t - x_w_t;
-  d_e_t.device(place) = x_e_t - grid_x_t;
-  d_n_t.device(place) = grid_y_t - y_n_t;
-  d_s_t.device(place) = y_s_t - grid_y_t;
-
-  // calc 4 corner points value
-  v_wn->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-  v_en->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-  v_ws->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-  v_es->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-  getGridPointValue<T>(input, v_wn, *x_w, *y_n);
-  getGridPointValue<T>(input, v_en, *x_e, *y_n);
-  getGridPointValue<T>(input, v_ws, *x_w, *y_s);
-  getGridPointValue<T>(input, v_es, *x_e, *y_s);
-}
-
-template <typename T>
-static void bilinearInter(const platform::CPUDeviceContext& ctx,
-                          const Tensor& input, Tensor* grid_x, Tensor* grid_y,
-                          Tensor* out) {
-  auto& place = *ctx.eigen_device();
-  const int n = grid_x->dims()[0];
-  const int out_h = grid_x->dims()[1];
-  const int out_w = grid_x->dims()[2];
-  const int c = input.dims()[1];
-
-  Tensor x_w, x_e, y_n, y_s;
-  Tensor d_w, d_e, d_n, d_s;
-  Tensor v_wn, v_en, v_ws, v_es;
-
-  allNeigbors<T>(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e,
-                 &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es);
-
-  auto d_w_t = EigenTensor<T, 3>::From(d_w);
-  auto d_e_t = EigenTensor<T, 3>::From(d_e);
-  auto d_n_t = EigenTensor<T, 3>::From(d_n);
-  auto d_s_t = EigenTensor<T, 3>::From(d_s);
-
-  auto d_w_scaled_t =
-      d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
-  auto d_e_scaled_t =
-      d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
-  auto d_n_scaled_t =
-      d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
-  auto d_s_scaled_t =
-      d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
-  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-  auto v_en_t = EigenTensor<T, 4>::From(v_en);
-  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-  auto v_es_t = EigenTensor<T, 4>::From(v_es);
-  auto output_t = EigenTensor<T, 4>::From(*out);
-  // bilinear interpolaetion by 4 corner points
-  output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
-                           v_en_t * d_w_scaled_t * d_s_scaled_t +
-                           v_ws_t * d_e_scaled_t * d_n_scaled_t +
-                           v_es_t * d_w_scaled_t * d_n_scaled_t;
-}
-
-template <typename T>
-static void nearestInter(const platform::CPUDeviceContext& ctx,
-                         const Tensor& input, Tensor* grid_x, Tensor* grid_y,
-                         Tensor* out) {
-  auto& place = *ctx.eigen_device();
-
-  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
-  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
-  grid_x_t = grid_x_t.round();
-  grid_y_t = grid_y_t.round();
-  getGridPointValue<T>(input, out, *grid_x, *grid_y);
-}
-
-template <typename T>
-static void gatherOutputGradToInputGrad(const Tensor& output_grad,
-                                        Tensor* input_grad, const Tensor& x,
-                                        const Tensor& y, const Tensor& d1,
-                                        const Tensor& d2) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_h = output_grad.dims()[2];
-  const int out_w = output_grad.dims()[3];
-  const int in_h = input_grad->dims()[2];
-  const int in_w = input_grad->dims()[3];
-  auto x_t = EigenTensor<T, 3>::From(x);
-  auto y_t = EigenTensor<T, 3>::From(y);
-  auto d1_t = EigenTensor<T, 3>::From(d1);
-  auto d2_t = EigenTensor<T, 3>::From(d2);
-  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
-  for (int i = 0; i < n; i++) {
-    for (int k = 0; k < out_h; k++) {
-      for (int l = 0; l < out_w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
-                      (T)(in_h - 1))) {
-          for (int j = 0; j < c; j++) {
-            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                         static_cast<int>(round(x_t(i, k, l)))) +=
-                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void gatherOutputGradToInputGrad(const Tensor& output_grad,
-                                        Tensor* input_grad, const Tensor& x,
-                                        const Tensor& y) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_h = output_grad.dims()[2];
-  const int out_w = output_grad.dims()[3];
-  const int in_h = input_grad->dims()[2];
-  const int in_w = input_grad->dims()[3];
-  auto x_t = EigenTensor<T, 3>::From(x);
-  auto y_t = EigenTensor<T, 3>::From(y);
-  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-  for (int i = 0; i < n; i++) {
-    for (int k = 0; k < out_h; k++) {
-      for (int l = 0; l < out_w; l++) {
-        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1),
-                      (T)(in_h - 1))) {
-          for (int j = 0; j < c; j++) {
-            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
-                         static_cast<int>(round(x_t(i, k, l)))) +=
-                output_grad_t(i, j, k, l);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx,
-                               const Tensor& input, const Tensor& output_grad,
-                               Tensor* grid_x, Tensor* grid_y,
-                               Tensor* grid_x_scale, Tensor* grid_y_scale,
-                               Tensor* input_grad, Tensor* grid_grad) {
-  const int n = grid_x->dims()[0];
-  const int out_h = grid_x->dims()[1];
-  const int out_w = grid_x->dims()[2];
-  const int c = input.dims()[1];
-
-  Tensor x_w, x_e, y_n, y_s;
-  Tensor d_w, d_e, d_n, d_s;
-  Tensor v_wn, v_en, v_ws, v_es;
-
-  allNeigbors<T>(ctx, input,
-                 grid_x,  // grid_x
-                 grid_y,  // grid_y
-                 &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en,
-                 &v_ws, &v_es);
-
-  // gather output grad value to input grad by corner point coords and weight
-  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
-  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
-  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
-  gatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
-
-  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
-  auto v_en_t = EigenTensor<T, 4>::From(v_en);
-  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
-  auto v_es_t = EigenTensor<T, 4>::From(v_es);
-
-  auto d_w_t = EigenTensor<T, 3>::From(d_w);
-  auto d_e_t = EigenTensor<T, 3>::From(d_e);
-  auto d_n_t = EigenTensor<T, 3>::From(d_n);
-  auto d_s_t = EigenTensor<T, 3>::From(d_s);
-
-  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
-
-  if (grid_grad != nullptr) {
-    Tensor grid_grad_x, grid_grad_y;
-    grid_grad_x.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-    grid_grad_y.mutable_data<T>({n, out_h, out_w}, ctx.GetPlace());
-    auto grid_grad_x_t =
-        EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
-    auto grid_grad_y_t =
-        EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-        for (int k = 0; k < out_h; k++) {
-          for (int l = 0; l < out_w; l++) {
-            grid_grad_x_t(i, k, l) +=
-                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-            grid_grad_y_t(i, k, l) +=
-                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
-                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
-                output_grad_t(i, j, k, l);
-          }
-        }
-      }
-    }
-
-    //  const T x_max = static_cast<T>(in_w - 1);
-    //  const T y_max = static_cast<T>(in_h - 1);
-
-    auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
-    auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
-    grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
-    grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
-
-    // gather grid_grad [x, y] in 3rd Dim
-    T* grid_grad_data = grid_grad->data<T>();
-    T* grid_grad_x_data = grid_grad_x.data<T>();
-    T* grid_grad_y_data = grid_grad_y.data<T>();
-    for (int i = 0; i < n * out_h * out_w; i++) {
-      grid_grad_data[2 * i] = grid_grad_x_data[i];
-      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class GridSampleOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto align_corners = ctx.Attr<bool>("align_corners");
-    auto padding_mode = ctx.Attr<std::string>("padding_mode");
-    auto mode = ctx.Attr<std::string>("mode");
-
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-
-    const int n = grid->dims()[0];
-    const int out_h = grid->dims()[1];
-    const int out_w = grid->dims()[2];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-
-    auto* output = ctx.Output<Tensor>("Output");
-    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), output,
-        static_cast<T>(0));
-
-    Tensor grid_x, grid_y;
-    calcGridLocations<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
-        in_w, align_corners, padding_mode, &grid_x, &grid_y);
-    if (mode == "bilinear") {
-      bilinearInter<T>(
-          ctx.template device_context<platform::CPUDeviceContext>(), *input,
-          &grid_x, &grid_y, output);
-    } else if (mode == "nearest") {
-      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
-      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-      grid_x_t = grid_x_t.round();
-      grid_y_t = grid_y_t.round();
-      getGridPointValue<T>(*input, output, grid_x, grid_y);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class GridSampleGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto align_corners = ctx.Attr<bool>("align_corners");
-    auto padding_mode = ctx.Attr<std::string>("padding_mode");
-    auto mode = ctx.Attr<std::string>("mode");
-
-    auto* input = ctx.Input<Tensor>("X");
-    auto* grid = ctx.Input<Tensor>("Grid");
-    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
-
-    const int n = grid->dims()[0];
-    const int out_h = grid->dims()[1];
-    const int out_w = grid->dims()[2];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
-    phi::funcs::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), input_grad,
-        static_cast<T>(0));
-
-    Tensor* grid_grad = nullptr;
-    if (ctx.HasOutput(framework::GradVarName("Grid"))) {
-      grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
-      grid_grad->mutable_data<T>({n, out_h, out_w, 2}, ctx.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), grid_grad,
-          static_cast<T>(0));
-    }
-
-    Tensor grid_x, grid_y;
-    Tensor grid_x_scale, grid_y_scale;
-    calcGridLocationsWithGrad<T>(
-        ctx.template device_context<platform::CPUDeviceContext>(), *grid, in_h,
-        in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale,
-        &grid_y_scale);
-    if (mode == "bilinear") {
-      gatherBilinearGrad<T>(ctx.template device_context<DeviceContext>(),
-                            *input, *output_grad, &grid_x, &grid_y,
-                            &grid_x_scale, &grid_y_scale, input_grad,
-                            grid_grad);
-    } else {
-      auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
-      auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
-      grid_x_t = grid_x_t.round();
-      grid_y_t = grid_y_t.round();
-      gatherOutputGradToInputGrad<T>(*output_grad, input_grad, grid_x, grid_y);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index ab8c50d90b8ec..c08f1920205da 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -152,6 +152,21 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
   }
 }
 
+template <typename T>
+__device__ __forceinline__ void ReduceMeanAndVar(T* mean, T* var, T x_mean,
+                                                 T x_var, int size) {
+  const int nc = blockIdx.x;
+  x_mean = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
+      x_mean, kps::AddFunctor<T>());
+  x_var = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
+      x_var, kps::AddFunctor<T>());
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    mean[nc] = static_cast<T>(x_mean / size);
+    var[nc] = static_cast<T>(x_var / size);
+  }
+}
+
 template <typename T>
 __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
   int i = blockIdx.x;
@@ -162,10 +177,7 @@ __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
     x_mean += val;
     x_var += val * val;
   }
-  x_mean /= size;
-  x_var /= size;
-  CudaAtomicAddWithWarp(&mean[i], x_mean);
-  CudaAtomicAddWithWarp(&var[i], x_var);
+  ReduceMeanAndVar<T>(mean, var, x_mean, x_var, size);
 }
 
 template <typename T, typename AccT, int VecSize>
@@ -174,21 +186,12 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
   int i = blockIdx.x;
   AccT x_mean = static_cast<AccT>(0);
   AccT x_var = static_cast<AccT>(0);
-  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * size;
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   phi::Array<const T*, 1> ins;
   ins[0] = x;
   ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
-
-  x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      x_mean, kps::AddFunctor<AccT>());
-  x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      x_var, kps::AddFunctor<AccT>());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    mean[i] = static_cast<T>(x_mean / size);
-    var[i] = static_cast<T>(x_var / size);
-  }
+  ReduceMeanAndVar<AccT>(mean, var, x_mean, x_var, size);
 }
 
 template <typename T, int flags>
@@ -272,10 +275,6 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     Tensor temp_var;
     temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
-
-    set_zero(dev_ctx, mean, static_cast<T>(0));
-    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
-
     auto* x_data = x->data<T>();
     auto* y_data = y->data<T>();
     auto* mean_data = mean->data<T>();
@@ -319,7 +318,7 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
       block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       dim3 grids(x_dims[0] * groups);
       dim3 blocks(block_size_nchw);
-      if (size < vec_size) {
+      if (size < vec_size * block_size_nchw) {
         ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
             x_data, mean_data, temp_var_data, size);
       } else {
@@ -328,6 +327,8 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
             x_data, mean_data, temp_var_data, size);
       }
     } else {
+      set_zero(dev_ctx, mean, static_cast<T>(0));
+      set_zero(dev_ctx, &temp_var, static_cast<T>(0));
       GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
           x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
           temp_var_data);
@@ -424,24 +425,15 @@ __global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
   int i = blockIdx.x;
   AccT ds_sum = static_cast<AccT>(0);
   AccT db_sum = static_cast<AccT>(0);
-  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * imsize;
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
 
   phi::Array<const T*, 2> ins;
   ins[0] = x;
   ins[1] = dy;
   ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
                                     &ds_sum);
-
-  ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      ds_sum, kps::AddFunctor<AccT>());
-  db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      db_sum, kps::AddFunctor<AccT>());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    ds[i] = ds_sum;
-    db[i] = db_sum;
-  }
+  ReduceMeanAndVar<AccT>(db, ds, db_sum, ds_sum, 1);
 }
 
 template <typename T>
@@ -455,8 +447,7 @@ __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
     ds_sum += dy[index] * x[index];
     db_sum += dy[index];
   }
-  CudaAtomicAddWithWarp(&ds[nc], ds_sum);
-  CudaAtomicAddWithWarp(&db[nc], db_sum);
+  ReduceMeanAndVar<T>(db, ds, db_sum, ds_sum, 1);
 }
 
 template <typename T>
@@ -641,13 +632,7 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
       }
       block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       dim3 blocks(block_size_nchw);
-      if (imsize < vec_size) {
-        if (d_scale) {
-          set_zero(dev_ctx, d_scale, static_cast<T>(0));
-        }
-        if (d_bias) {
-          set_zero(dev_ctx, d_bias, static_cast<T>(0));
-        }
+      if (imsize < vec_size * block_size_nchw) {
         ScalarGetDsDbCUDAKernel<
             T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
             imsize, x_data, dy_data, ds_data, db_data);
@@ -687,7 +672,6 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
             imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
             dy_data, d_x_data);
       }
-
     } else {
       if (d_scale) {
         set_zero(dev_ctx, d_scale, static_cast<T>(0));
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 9575ab54b32bd..93f0d3d334f27 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
 #include <string>
 #include <vector>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/multiary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -60,31 +64,6 @@ namespace operators {
 class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid");
-    OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid");
-
-    auto with_prefetch = ctx->Attrs().Get<bool>("remote_prefetch");
-    if (with_prefetch) {
-      OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid");
-    }
-    const int64_t input_dims = ctx->GetInputDim("X")[0];
-    const int64_t label_dims = ctx->GetInputDim("Label")[0];
-    PADDLE_ENFORCE_EQ(input_dims, label_dims,
-                      platform::errors::InvalidArgument(
-                          "The first dimension of "
-                          "input and label is expected to be the same. "
-                          "But received input's first dimension is %d; "
-                          "label's first dimension is %d.",
-                          input_dims, label_dims));
-
-    std::vector<int64_t> output_shape({input_dims, 1});
-    ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(
-    hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
-    ops::HierarchicalSigmoidOpMaker<int>,
-    ops::HierarchicalSigmoidGradMaker<paddle::framework::OpDesc>,
-    ops::HierarchicalSigmoidGradMaker<paddle::imperative::OpBase>);
+DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid,
+                            HierarchicalSigmoidInferShapeFunctor,
+                            PD_INFER_META(phi::HierarchicalSigmoidInferMeta));
+REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
+                  ops::HierarchicalSigmoidOpMaker<int>,
+                  ops::HierarchicalSigmoidGradMaker<paddle::framework::OpDesc>,
+                  ops::HierarchicalSigmoidGradMaker<paddle::imperative::OpBase>,
+                  HierarchicalSigmoidInferShapeFunctor);
 REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp,
                   ops::HierarchicalSigmoidGradOpGradVarTypeInference,
                   ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(
-    hierarchical_sigmoid,
-    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
-                                     double>);
-REGISTER_OP_CPU_KERNEL(
-    hierarchical_sigmoid_grad,
-    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
-                                         float>,
-    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
-                                         double>);
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
deleted file mode 100644
index f11b28cfefb07..0000000000000
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/clip_op.h"
-#include "paddle/fluid/operators/math/matrix_bit_code.h"
-#include "paddle/fluid/platform/transform.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-using platform::Transform;
-using framework::LoDTensor;
-
-static std::vector<int64_t> PathToRows(const LoDTensor& path) {
-  std::set<int64_t> rows;
-  const int64_t* paths = path.data<int64_t>();
-  for (int64_t i = 0; i < path.numel(); ++i) {
-    int64_t row = paths[i];
-    if (row < 0) {
-      continue;
-    }
-    rows.emplace(row);
-  }
-  return std::vector<int64_t>(rows.begin(), rows.end());
-}
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
-                               "HierarchicalSigmoid");
-    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
-                              "HierarchicalSigmoid");
-    auto* path = ctx.Input<LoDTensor>("PathTable");
-    auto* code = ctx.Input<LoDTensor>("PathCode");
-    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
-                                  "Label", "HierarchicalSigmoid");
-    auto* bias = ctx.Input<LoDTensor>("Bias");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    auto* pre_out = ctx.Output<LoDTensor>("PreOut");
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    // for remote prefetch
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-    int64_t code_length =
-        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
-    int64_t batch_size = in.dims()[0];
-    LoDTensor sum;
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto* pre_out_data = pre_out->mutable_data<T>(
-        phi::make_ddim({batch_size, code_length}), ctx.GetPlace());
-    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
-    // Not all class(leaf) nodes' path lengths equal code_length, thus init as
-    // 0s can avoid out of path's loss.
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    zero(dev_ctx, pre_out, static_cast<T>(0.0));
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::RowwiseSum<DeviceContext, T> row_sum;
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          num_classes, label.template data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          *path, *code, label.template data<int64_t>()));
-    }
-
-    std::vector<int64_t> sum_dims({batch_size, 1UL});
-    sum.mutable_data<T>(phi::make_ddim(sum_dims), ctx.GetPlace());
-    auto sum_mat = EigenMatrix<T>::From(sum);
-    out->mutable_data<T>(ctx.GetPlace());
-    auto out_mat = framework::EigenMatrix<T>::From(*out);
-    if (bias) {
-      bit_code->Add(*bias, pre_out);
-    }
-    bit_code->Mul(pre_out, w, in);
-    // clip to [-40, 40]
-    Transform<DeviceContext> trans;
-    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
-          pre_out_data + pre_out->numel(), pre_out_data,
-          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
-    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
-    // use softrelu to calculate cross entropy
-    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
-    row_sum(dev_ctx, *pre_out, &sum);
-    // TODO(guosheng): Subtract the out of path's loss, since not all
-    // class(leaf) nodes' path lengths equal code_length. But it won't break the
-    // gradient check since both have the out of path's loss and will cancel out
-    // each other.
-    out_mat.device(place) = sum_mat + out_mat;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
-                               "HierarchicalSigmoidGrad");
-    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
-                              "HierarchicalSigmoidGrad");
-    auto* path = ctx.Input<LoDTensor>("PathTable");
-    auto* code = ctx.Input<LoDTensor>("PathCode");
-    auto* in_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-    bool is_sparse = ctx.Attr<bool>("is_sparse");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> zero;
-    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
-                                  "Label", "HierarchicalSigmoidGrad");
-    auto& pre_out = GET_DATA_SAFELY(ctx.Input<LoDTensor>("PreOut"), "Input",
-                                    "PreOut", "HierarchicalSigmoidGrad");
-    auto& out_grad = GET_DATA_SAFELY(
-        ctx.Input<LoDTensor>(framework::GradVarName("Out")), "Input",
-        framework::GradVarName("Out"), "HierarchicalSigmoidGrad");
-    LoDTensor pre_out_grad;
-
-    pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    zero(dev_ctx, in_grad, static_cast<T>(0.0));
-
-    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-
-    bool is_custom = false;
-    if (path) {
-      is_custom = true;
-    }
-
-    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
-    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          num_classes, label.template data<int64_t>()));
-    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
-          *path, *code, label.template data<int64_t>()));
-    }
-
-    // softrelu derivative
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(ctx);
-
-    auto* pre_out_grad_data = pre_out_grad.data<T>();
-    auto* pre_out_data = pre_out.template data<T>();
-    auto n = pre_out.numel();
-    blas.VEXP(n, pre_out_data, pre_out_grad_data);
-    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
-    for (int64_t i = 0; i < n; ++i) {
-      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
-    }
-    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    auto* out_grad_data = out_grad.template data<T>();
-
-    int64_t dim0 = pre_out_grad.dims()[0];
-    int64_t dim1 = pre_out_grad.dims()[1];
-    for (int64_t i = 0; i < dim0; ++i) {
-      T tmp = out_grad_data[i];
-      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
-    }
-    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
-    // be consistent with the clipping in forward.
-    auto* bias_grad = ctx.Output<LoDTensor>(framework::GradVarName("Bias"));
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
-    if (!is_sparse) {
-      auto* w_grad = ctx.Output<LoDTensor>(framework::GradVarName("W"));
-      w_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, w_grad, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    } else {
-      PADDLE_ENFORCE_NOT_NULL(path,
-                              platform::errors::NotFound(
-                                  "Custom tree must be set for sparse mode!"));
-      framework::Vector<int64_t> real_rows = PathToRows(*path);
-      auto* w_grad = ctx.Output<phi::SelectedRows>(framework::GradVarName("W"));
-      w_grad->set_rows(real_rows);
-      // Build a map of id -> row_index to speed up finding the index of one id
-      w_grad->set_height(w.dims()[0]);
-      auto* w_grad_value = w_grad->mutable_value();
-      framework::DDim temp_dim(w.dims());
-      temp_dim[0] = real_rows.size();
-      w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
-      zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
-      bit_code->MulGradWeight(pre_out_grad, w_grad, in);
-    }
-    bit_code->MulGradError(pre_out_grad, w, in_grad);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc
index 92cc6077defcd..c9fd75651b589 100644
--- a/paddle/fluid/operators/histogram_op.cc
+++ b/paddle/fluid/operators/histogram_op.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram");
-    const auto &nbins = ctx->Attrs().Get<int64_t>("bins");
-    const auto &minval = ctx->Attrs().Get<int>("min");
-    const auto &maxval = ctx->Attrs().Get<int>("max");
-
-    PADDLE_ENFORCE_GE(nbins, 1,
-                      platform::errors::InvalidArgument(
-                          "The bins should be greater than or equal to 1."
-                          "But received nbins is %d",
-                          nbins));
-    PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument(
-                                          "max must be larger or equal to min."
-                                          "But received max is %d, min is %d",
-                                          maxval, minval));
-
-    ctx->SetOutputDim("Out", phi::make_ddim({nbins}));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const {
     auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
@@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor,
+                            PD_INFER_META(phi::HistogramInferMeta));
+
 REGISTER_OPERATOR(
     histogram, ops::HistogramOp, ops::HistogramOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    HistogramInferShapeFunctor);
diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc
index fea71edf41313..069cc9416a620 100644
--- a/paddle/fluid/operators/index_select_op.cc
+++ b/paddle/fluid/operators/index_select_op.cc
@@ -13,8 +13,13 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/index_select_op.h"
+
 #include <memory>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of IndexSelectOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) of IndexSelectOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of IndexSelectOp should not be null."));
-
-    auto input_dim = ctx->GetInputDim("X");
-    auto index_dim = ctx->GetInputDim("Index");
-    auto dim = ctx->Attrs().Get<int>("dim");
-
-    PADDLE_ENFORCE_EQ(
-        dim < input_dim.size() && dim >= (0 - input_dim.size()), true,
-        platform::errors::OutOfRange(
-            "Attr(dim) is out of range, It's expected "
-            "to be in range of [-%d, %d]. But received Attr(dim) = %d.",
-            input_dim.size(), input_dim.size() - 1, dim));
-
-    PADDLE_ENFORCE_EQ(
-        index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
-        true, platform::errors::InvalidArgument(
-                  "The 'shape' of Input(Index) must be 1-D tensor. "
-                  "But received: the 'shape' of Input(Index) is [%s], "
-                  "the dimension of Input(Index) is [%d].",
-                  index_dim, index_dim.size()));
-
-    PADDLE_ENFORCE_EQ(index_dim[0] != 0, true,
-                      platform::errors::InvalidArgument(
-                          "The length of Input(Index) can't be 0."));
-
-    auto output_dim = phi::vectorize(input_dim);
-    if (dim < 0) {
-      dim += input_dim.size();
-    }
-    output_dim[dim] = index_dim[0];
-    ctx->SetOutputDim("Out", phi::make_ddim(output_dim));
-    auto type = ctx->GetInputsVarType("X")[0];
-    if (type == framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer,
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor,
+                            PD_INFER_META(phi::IndexSelectInferMeta));
 REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker,
                   ops::IndexSelectGradMaker<paddle::framework::OpDesc>,
-                  ops::IndexSelectGradMaker<paddle::imperative::OpBase>);
+                  ops::IndexSelectGradMaker<paddle::imperative::OpBase>,
+                  IndexSelectInferShapeFunctor);
 REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp,
                   ops::IndexSelectGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    index_select,
-    ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IndexSelectKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    index_select_grad,
-    ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::IndexSelectGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu
deleted file mode 100644
index f810aee2adea5..0000000000000
--- a/paddle/fluid/operators/index_select_op.cu
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/index_select_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T, typename IndexT>
-__global__ void index_select_cuda_kernel(const T* input, T* output,
-                                         const IndexT* index, int64_t N,
-                                         int64_t stride, int64_t size,
-                                         int64_t delta) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-
-  int64_t pre_idx = idx / (stride * size);
-  int64_t dim_idx = idx % (stride * size) / stride;
-  IndexT src_dim_idx = index[dim_idx];
-  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
-  output[idx] = input[input_idx];
-}
-
-template <typename T, typename IndexT>
-__global__ void index_select_grad_cuda_kernel(const T* output_grad,
-                                              T* input_grad,
-                                              const IndexT* index, int64_t nums,
-                                              int64_t N, int64_t stride,
-                                              int64_t size, int64_t delta) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-
-  int64_t pre_idx = idx / (stride * size);
-  int64_t dim_idx = idx % (stride * size) / stride;
-  IndexT src_dim_idx = index[dim_idx];
-  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
-  paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
-}
-
-template <typename T>
-__global__ void index_select_grad_init(T* input_grad, int64_t N) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-  input_grad[idx] = 0.0;
-}
-
-template <typename DeviceContext, typename T>
-class IndexSelectCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* index = context.Input<LoDTensor>("Index");
-    auto* out = context.Output<LoDTensor>("Out");
-    int dim = context.Attr<int>("dim");
-    auto input_dim = in->dims();
-    auto output_dim = out->dims();
-    dim = dim >= 0 ? dim : dim + input_dim.size();
-    auto stride_dim = phi::stride(input_dim);
-    int64_t stride = stride_dim[dim];
-    int64_t size = output_dim[dim];
-    int64_t delta = input_dim[dim] - size;
-
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT64 ||
-                            index_type == framework::proto::VarType::INT32;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = out->numel();
-
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-
-    if (index_type == framework::proto::VarType::INT64) {
-      const int64_t* index_data = index->data<int64_t>();
-      index_select_cuda_kernel<T, int64_t><<<
-          (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data,
-                                                numel, stride, size, delta);
-      platform::GpuStreamSync(stream);
-    } else {
-      const int* index_data = index->data<int>();
-      index_select_cuda_kernel<T, int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                                             PADDLE_CUDA_NUM_THREADS,
-                                         PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-          in_data, out_data, index_data, numel, stride, size, delta);
-      platform::GpuStreamSync(stream);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* output_grad = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* in_grad = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* index = context.Input<LoDTensor>("Index");
-
-    auto* output_grad_data = output_grad->data<T>();
-    auto* in_grad_data = in_grad->mutable_data<T>(context.GetPlace());
-
-    int dim = context.Attr<int>("dim");
-    auto input_dim = in_grad->dims();
-    auto output_dim = output_grad->dims();
-    dim = dim >= 0 ? dim : dim + input_dim.size();
-    auto stride_dim = phi::stride(input_dim);
-    int64_t stride = stride_dim[dim];
-    int64_t size = output_dim[dim];
-    int64_t delta = input_dim[dim] - size;
-
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT64 ||
-                            index_type == framework::proto::VarType::INT32;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-
-    int64_t numel = in_grad->numel();
-    int64_t index_nums = index->numel();
-    int64_t out_nums = output_grad->numel();
-
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-
-    index_select_grad_init<
-        T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
-
-    if (index_type == framework::proto::VarType::INT64) {
-      const int64_t* index_data = index->data<int64_t>();
-      index_select_grad_cuda_kernel<T, int64_t><<<
-          (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
-                                                index_data, index_nums,
-                                                out_nums, stride, size, delta);
-      platform::GpuStreamSync(stream);
-    } else {
-      const int* index_data = index->data<int>();
-      index_select_grad_cuda_kernel<T, int><<<
-          (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
-                                                index_data, index_nums,
-                                                out_nums, stride, size, delta);
-      platform::GpuStreamSync(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    index_select,
-    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
-                               paddle::platform::float16>,
-    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    index_select_grad,
-    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>,
-    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h
index 04b4f69add785..684829be2697c 100644
--- a/paddle/fluid/operators/index_select_op.h
+++ b/paddle/fluid/operators/index_select_op.h
@@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context,
   output->Resize(output_dim);
 }
 
-template <typename DeviceContext, typename T>
-class IndexSelectKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto inputs = *context.Input<framework::LoDTensor>("X");
-    auto* index = context.Input<framework::LoDTensor>("Index");
-    auto* output = context.Output<framework::LoDTensor>("Out");
-
-    int dim = context.Attr<int>("dim");
-    if (dim < 0) {
-      dim += inputs.dims().size();
-    }
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-
-    if (index_type == framework::proto::VarType::INT32) {
-      IndexSelectInner<DeviceContext, T, int>(context, &inputs, *index, output,
-                                              dim);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      IndexSelectInner<DeviceContext, T, int64_t>(context, &inputs, *index,
-                                                  output, dim);
-    }
-  }
-};
-
 template <typename DeviceContext, typename T, class Enable = void>
 struct IndexSelectAdd {
   void operator()(const framework::ExecutionContext& ctx, int slice_size,
@@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context,
   x_grad->Resize(output_dim);
 }
 
-template <typename DeviceContext, typename T>
-class IndexSelectGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x_grad =
-        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto* index = context.Input<framework::LoDTensor>("Index");
-    auto* out_grad =
-        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-
-    int dim = context.Attr<int>("dim");
-    if (dim < 0) {
-      dim += out_grad->dims().size();
-    }
-    const auto& index_type = framework::TransToProtoVarType(index->dtype());
-
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Input(Index) holds the wrong type, it holds %s, but "
-                          "desires to be %s or %s",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-
-    if (index_type == framework::proto::VarType::INT32) {
-      IndexSelectGradInner<DeviceContext, T, int>(context, *out_grad, *index,
-                                                  x_grad, dim);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      IndexSelectGradInner<DeviceContext, T, int64_t>(context, *out_grad,
-                                                      *index, x_grad, dim);
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc
index bce7a3c1caae3..a232fba7e28d6 100644
--- a/paddle/fluid/operators/index_select_op_npu.cc
+++ b/paddle/fluid/operators/index_select_op_npu.cc
@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/index_select_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename DeviceContext, typename T>
 class IndexSelectNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index 7f5136969980b..77951ff394e74 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker,
                   ops::BatchNormOpInferVarType,
                   ops::InplaceABNOpGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc
index 0ae7a9fa02f1f..1c79213757fdf 100644
--- a/paddle/fluid/operators/isclose_op.cc
+++ b/paddle/fluid/operators/isclose_op.cc
@@ -12,56 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/isclose_op.h"
 #include <cmath>
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct GetTensorValue<platform::CPUDeviceContext, T> {
-  T operator()(const platform::CPUDeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    return *(tensor.data<T>());
-  }
-};
-
-template <typename T>
-struct IscloseFunctor<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    auto* in_a = in.data<T>();
-    auto* in_b = other.data<T>();
-    auto* out_data = output->mutable_data<bool>(ctx.GetPlace());
-    auto num = in.numel();
-    // *out_data = true;
-    for (int i = 0; i < num; i++) {
-      out_data[i] = true;
-    }
-    for (int i = 0; i < num; i++) {
-      const T a = in_a[i], b = in_b[i];
-      bool val;
-      if (std::isnan(a) || std::isnan(b)) {
-        val = equal_nan && std::isnan(a) == std::isnan(b);
-      } else {
-        T left = (a > b ? a - b : b - a);
-        T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-        T diff = (left > right ? left - right : right - left);
-        val = a == b || left <= right || diff <= 1e-15;
-      }
-      // *out_data &= val;
-      out_data[i] = val;
-    }
-  }
-};
-
 class IscloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -100,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose");
-    OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose");
-
-    auto input_dim = ctx->GetInputDim("Input");
-    auto other_dim = ctx->GetInputDim("Other");
-    PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(),
-                      platform::errors::PreconditionNotMet(
-                          "Input(Input) and Input(Other) must have the same "
-                          "dimension size."));
-    int n = input_dim.size();
-    bool is_runtime = ctx->IsRuntime();
-    for (int i = 0; i < n; i++) {
-      if (is_runtime) {
-        PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
-                          platform::errors::PreconditionNotMet(
-                              "The value at dim %d of Input(Input) is not "
-                              "equal to the Input(Other): %ld != %ld.",
-                              i, input_dim[i], other_dim[i]));
-      } else {
-        if (!(input_dim[i] < 0 || other_dim[i] < 0)) {
-          PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i],
-                            platform::errors::PreconditionNotMet(
-                                "The value at dim %d of Input(Input) is not "
-                                "equal to the Input(Other): %ld != %ld.",
-                                i, input_dim[i], other_dim[i]));
-        }
-      }
-    }
-
-    ctx->SetOutputDim("Out", input_dim);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -154,12 +83,11 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
 
+DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor,
+                            PD_INFER_META(phi::ValueCompareInferMeta));
 REGISTER_OPERATOR(
     isclose, ops::IscloseOp, ops::IscloseOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    ops::IscloseOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel<CPU, float>,
-                       ops::IscloseKernel<CPU, double>);
+    ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor);
diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu
deleted file mode 100644
index 09710ba0c6957..0000000000000
--- a/paddle/fluid/operators/isclose_op.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/isclose_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct GetTensorValue<platform::CUDADeviceContext, T> {
-  T operator()(const platform::CUDADeviceContext& dev_ctx,
-               const framework::Tensor& tensor) const {
-    const T* data = tensor.data<T>();
-    T value;
-    const auto gpu_place = dev_ctx.GetPlace();
-    memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
-                 dev_ctx.stream());
-    return value;
-  }
-};
-
-template <typename T>
-__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data,
-                                  const double rtol, const double atol,
-                                  bool equal_nan, int num, bool* out_data) {
-  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  bool val;
-  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
-    const T a = in_data[i], b = other_data[i];
-    if (isnan(a) || isnan(b)) {
-      val = equal_nan && isnan(a) == isnan(b);
-    } else {
-      T left = (a > b ? a - b : b - a);
-      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
-      T diff = (left > right ? left - right : right - left);
-      val = a == b || left <= right || diff <= 1e-15;
-    }
-    out_data[i] = val;
-    // if (!val) *out_data = false;
-  }
-}
-
-template <typename T>
-struct IscloseFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const framework::Tensor& in, const framework::Tensor& other,
-                  const double rtol, const double atol, bool equal_nan,
-                  framework::Tensor* output) {
-    int num = in.numel();
-    const T* in_data = in.data<T>();
-    const T* other_data = other.data<T>();
-    bool* out_data = output->mutable_data<bool>(dev_ctx.GetPlace());
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-    grid = (grid > block) ? block : grid;
-#ifdef PADDLE_WITH_HIP
-    hipMemset(out_data, true, num * sizeof(bool));
-#else
-    cudaMemset(out_data, true, num * sizeof(bool));
-#endif
-    IscloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        in_data, other_data, rtol, atol, equal_nan, num, out_data);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel<CUDA, float>,
-                        ops::IscloseKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h
deleted file mode 100644
index cde5d2afbf009..0000000000000
--- a/paddle/fluid/operators/isclose_op.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-struct GetTensorValue {
-  T operator()(const platform::DeviceContext& ctx,
-               const framework::Tensor& tensor) const;
-};
-
-template <typename DeviceContext, typename T>
-struct IscloseFunctor {
-  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
-                  const framework::Tensor& other, const float rtol,
-                  const float atol, bool equal_nan, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
-class IscloseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // get attrs
-    bool equal_nan = ctx.Attr<bool>("equal_nan");
-    // get input/output
-    const auto* input = ctx.Input<Tensor>("Input");
-    const auto* other = ctx.Input<Tensor>("Other");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    double rtol_v = std::stod(ctx.Attr<std::string>("rtol"));
-    double atol_v = std::stod(ctx.Attr<std::string>("atol"));
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    GetTensorValue<DeviceContext, double> get_tensor_value;
-    if (ctx.HasInput("Rtol")) {
-      const auto* rtol = ctx.Input<Tensor>("Rtol");
-      PADDLE_ENFORCE_EQ(
-          rtol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) size must be 1, but get %d.", rtol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(rtol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Rtol) type must be double, but get %s.",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(rtol->dtype()))));
-      rtol_v = get_tensor_value(dev_ctx, *rtol);
-    }
-    if (ctx.HasInput("Atol")) {
-      const auto* atol = ctx.Input<Tensor>("Atol");
-      PADDLE_ENFORCE_EQ(
-          atol->numel(), 1,
-          platform::errors::InvalidArgument(
-              "Input(Atol) size must be 1, but get %d", atol->numel()));
-      PADDLE_ENFORCE_EQ(
-          framework::TransToProtoVarType(atol->dtype()),
-          framework::proto::VarType::FP64,
-          platform::errors::InvalidArgument(
-              "Input(Atol) type must be double, but get %s",
-              framework::DataTypeToString(
-                  framework::TransToProtoVarType(atol->dtype()))));
-      atol_v = get_tensor_value(dev_ctx, *atol);
-    }
-
-    IscloseFunctor<DeviceContext, T>()(dev_ctx, *input, *other, rtol_v, atol_v,
-                                       equal_nan, out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
index a78d8ec10149d..67c1942ea0b41 100644
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ b/paddle/fluid/operators/kldiv_loss_op.cc
@@ -9,10 +9,11 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/fluid/operators/kldiv_loss_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,44 +23,6 @@ using framework::Tensor;
 class KLDivLossOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss");
-    OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss");
-    OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto dim_target = ctx->GetInputDim("Target");
-    PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
-                      platform::errors::InvalidArgument(
-                          "Input(X) rank and Input(Target) rank should be "
-                          "same, but received X rank(%d) != Target rank(%d)",
-                          dim_x.size(), dim_target.size()));
-    for (int i = 0; i < dim_x.size(); i++) {
-      if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
-        PADDLE_ENFORCE_EQ(
-            dim_x[i], dim_target[i],
-            platform::errors::InvalidArgument(
-                "Input(X) and Input(Target) should in same shape. but received "
-                "X dimension[%d](%d) != Target dimension[%d](%d)",
-                i, dim_x[i], i, dim_target[i]));
-      }
-    }
-
-    auto reduction = ctx->Attrs().Get<std::string>("reduction");
-
-    auto reduction_valid = "mean" == reduction || "sum" == reduction ||
-                           "batchmean" == reduction || "none" == reduction;
-    PADDLE_ENFORCE_EQ(
-        reduction_valid, true,
-        platform::errors::InvalidArgument(
-            "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
-
-    if ("none" == reduction) {
-      ctx->SetOutputDim("Loss", dim_x);
-    } else {
-      ctx->SetOutputDim("Loss", {1});
-    }
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -172,15 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor,
+                            PD_INFER_META(phi::KLDivInferMeta));
+
 REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
                   ops::KLDivLossOpGradMaker<paddle::framework::OpDesc>,
-                  ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>);
+                  ops::KLDivLossOpGradMaker<paddle::imperative::OpBase>,
+                  KLDivInferShapeFunctor);
 REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad,
                   ops::KLDivLossGradNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss, ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu
deleted file mode 100644
index 5226cb8c08e3d..0000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/kldiv_loss_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    kldiv_loss_grad,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
deleted file mode 100644
index 5a6ef06f5eb1e..0000000000000
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/hostdevice.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using Array1 = Eigen::DSizes<int64_t, 1>;
-
-template <typename T>
-struct KLDivLossForward {
-  HOSTDEVICE KLDivLossForward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& input) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return target * (std::log(target) - input);
-    }
-  }
-};
-
-template <typename T>
-struct KLDivLossBackward {
-  HOSTDEVICE KLDivLossBackward() {}
-
-  HOSTDEVICE T operator()(const T& target, const T& grad) const {
-    if (target <= 0) {
-      return 0;
-    } else {
-      return static_cast<T>(-1.) * grad;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* input = ctx.Input<Tensor>("X");
-    auto* target = ctx.Input<Tensor>("Target");
-    auto* loss = ctx.Output<Tensor>("Loss");
-    auto reduction = ctx.Attr<std::string>("reduction");
-
-    const int n = input->dims()[0];
-
-    loss->mutable_data<T>(ctx.GetPlace());
-    auto input_t = framework::EigenVector<T>::Flatten(*input);
-    auto target_t = framework::EigenVector<T>::Flatten(*target);
-    auto loss_t = framework::EigenVector<T>::Flatten(*loss);
-    auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
-    if ("none" == reduction) {
-      loss_t.device(place) = output;
-    } else if ("batchmean" == reduction) {
-      auto output_sum = output.sum();
-      if (n > 0) {
-        loss_t.device(place) = output_sum / output_sum.constant(n);
-      } else {
-        loss_t.device(place) = output_sum;
-      }
-    } else if ("mean" == reduction) {
-      loss_t.device(place) = output.mean();
-    } else if ("sum" == reduction) {
-      loss_t.device(place) = output.sum();
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    auto* target = ctx.Input<Tensor>("Target");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    const int n = input_grad->dims()[0];
-    const int numel = input_grad->numel();
-    const int expand = numel / loss_grad->numel();
-
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto target_t = framework::EigenVector<T>::Flatten(*target);
-
-    auto input_grad_t = framework::EigenVector<T>::Flatten(*input_grad);
-    auto loss_grad_t = framework::EigenVector<T>::Flatten(*loss_grad);
-
-    auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
-    auto grad_t = target_t * loss_grad_expand;
-    input_grad_t.device(place) =
-        target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
-
-    if ("mean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
-    } else if ("batchmean" == reduction) {
-      input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc
index 322ae5df4cb87..eac181489aa9d 100644
--- a/paddle/fluid/operators/kldiv_loss_op_npu.cc
+++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the Licnse. */
 
-#include "paddle/fluid/operators/kldiv_loss_op.h"
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc
index 68d0c7978b4e4..60390016d66e3 100644
--- a/paddle/fluid/operators/kron_op.cc
+++ b/paddle/fluid/operators/kron_op.cc
@@ -17,7 +17,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -26,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron");
-    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron");
-
-    auto dim_x = ctx->GetInputDim("X");
-    auto dim_y = ctx->GetInputDim("Y");
-    auto rank_x = dim_x.size();
-    auto rank_y = dim_y.size();
-    auto rank = (rank_x > rank_y) ? rank_x : rank_y;
-
-    std::vector<int64_t> dim_out;
-    dim_out.reserve(rank);
-    for (int i = 0; i < rank; i++) {
-      int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x));
-      int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y));
-      dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
-    }
-    ctx->SetOutputDim("Out", phi::make_ddim(dim_out));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -173,7 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor,
+                            PD_INFER_META(phi::KronInferMeta));
 REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker,
                   ops::KronGradOpMaker<paddle::framework::OpDesc>,
-                  ops::KronGradOpMaker<paddle::imperative::OpBase>);
+                  ops::KronGradOpMaker<paddle::imperative::OpBase>,
+                  KronInferShapeFunctor);
 REGISTER_OPERATOR(kron_grad, ops::KronGradOp);
diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc
index 2a79cee27814e..4c679d3026386 100644
--- a/paddle/fluid/operators/kthvalue_op.cc
+++ b/paddle/fluid/operators/kthvalue_op.cc
@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/kthvalue_op.h"
 #include <memory>
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue");
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_LT(axis, dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    PADDLE_ENFORCE_GE(axis, -dim_size,
-                      paddle::platform::errors::InvalidArgument(
-                          "the axis must be [-%d, %d), but received %d .",
-                          dim_size, dim_size, axis));
-    if (axis < 0) axis += dim_size;
-    int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
-    PADDLE_ENFORCE_GE(
-        k, 1, paddle::platform::errors::InvalidArgument(
-                  "the k in the kthvalue must >= 1, but received %d .", k));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of kthvalue must have >= 1d shape"));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of kthvalue must have >= %d columns in axis of %d", k,
-              axis));
-    }
-    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    std::vector<int64_t> dimvec;
-    for (int64_t i = 0; i < axis; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    if (keepdim) {
-      dimvec.emplace_back(static_cast<int64_t>(1));
-    }
-    for (int64_t i = axis + 1; i < dim_size; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    framework::DDim dims = phi::make_ddim(dimvec);
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor,
+                            PD_INFER_META(phi::KthvalueInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker,
                   ops::KthvalueGradOpMaker<paddle::framework::OpDesc>,
-                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue, ops::KthvalueCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::KthvalueCPUKernel<paddle::platform::CPUPlace, int64_t>);
+                  ops::KthvalueGradOpMaker<paddle::imperative::OpBase>,
+                  KthvalueInferShapeFunctor);
 
 REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    kthvalue_grad,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::KthvalueGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu
deleted file mode 100644
index f6f56f70f1a11..0000000000000
--- a/paddle/fluid/operators/kthvalue_op.cu
+++ /dev/null
@@ -1,278 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/kthvalue_op.h"
-#include "paddle/fluid/operators/top_k_function_cuda.h"
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-#endif
-
-namespace paddle {
-namespace operators {
-
-int getBlockSize(int col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256 && col <= 512)
-    return 512;
-  else if (col > 128 && col <= 256)
-    return 256;
-  else if (col > 64 && col <= 128)
-    return 128;
-  else
-    return 64;
-}
-
-template <typename T>
-bool SortKthvalue(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor* input_tensor, const int64_t num_cols,
-                  const int64_t num_rows, const int k,
-                  framework::Tensor* out_tensor,
-                  framework::Tensor* indices_tensor) {
-  auto cu_stream = ctx.stream();
-  framework::Tensor input_indices;
-  const std::vector<int64_t> dims = {num_rows, num_cols};
-  auto dim = phi::make_ddim(dims);
-  input_indices.Resize(dim);
-  input_indices.mutable_data<int64_t>(ctx.GetPlace());
-  size_t temp_storage_bytes = -1;
-  int block_size = getBlockSize(num_cols);
-  unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0];
-  unsigned int grid_size = num_rows < maxGridDimX
-                               ? static_cast<unsigned int>(num_rows)
-                               : maxGridDimX;
-  InitIndex<int64_t><<<grid_size, block_size, 0, cu_stream>>>(
-      input_indices.data<int64_t>(), num_rows, num_cols);
-  cub::CountingInputIterator<int64_t> counting_iter(0);
-  cub::TransformInputIterator<int64_t, SegmentOffsetIter,
-                              cub::CountingInputIterator<int64_t>>
-      segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
-  T* sorted_values_ptr;
-  int64_t* sorted_indices_ptr;
-  framework::Tensor temp_values, temp_indices;
-  const T* input = input_tensor->data<T>();
-  T* values = out_tensor->data<T>();
-  int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
-  temp_values.Resize(dim);
-  temp_indices.Resize(dim);
-  sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
-  sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
-  auto err = cub::DeviceSegmentedRadixSort::SortPairs(
-      nullptr, temp_storage_bytes, input, sorted_values_ptr,
-      input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
-      num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
-      cu_stream);
-#ifdef __HIPCC__
-  if (err != hipSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
-               << hipGetErrorString(err);
-    return false;
-  }
-#else
-  if (err != cudaSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "cub::DeviceSegmentedRadixSort::SortPairs, status: "
-               << cudaGetErrorString(err);
-    return false;
-  }
-#endif
-  framework::Tensor temp_storage;
-  temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
-
-  err = cub::DeviceSegmentedRadixSort::SortPairs(
-      temp_storage.data<uint8_t>(), temp_storage_bytes, input,
-      sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
-      num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
-      0, sizeof(T) * 8, cu_stream);
-#ifdef __HIPCC__
-  if (err != hipSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "hipcub::DeviceSegmentedRadixSort::SortPairs, "
-               << temp_storage_bytes << ", status: " << hipGetErrorString(err);
-    return false;
-  }
-#else
-  if (err != cudaSuccess) {
-    LOG(ERROR) << "KthvalueOP failed as could not launch "
-                  "cub::DeviceSegmentedRadixSort::SortPairs, "
-               << temp_storage_bytes << ", status: " << cudaGetErrorString(err);
-    return false;
-  }
-#endif
-  auto& dev = *ctx.eigen_device();
-  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
-  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
-  auto e_indices = framework::EigenMatrix<int64_t>::From(*indices_tensor, dim);
-  auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
-      static_cast<const framework::Tensor>(temp_indices));
-  std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
-  dim = phi::make_ddim(odims);
-  auto e_values = framework::EigenMatrix<T>::From(*out_tensor, dim);
-  auto e_tmp_values = framework::EigenMatrix<T>::From(
-      static_cast<const framework::Tensor>(temp_values));
-
-  EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
-      dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
-  EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
-      dev, e_values, e_tmp_values, slice_indices, slice_sizes);
-  return true;
-}
-
-template <typename DeviceContext, typename T>
-class KthvalueOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int k = static_cast<int>(ctx.Attr<int>("k"));
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
-    const auto& in_dims = input->dims();
-    if (axis < 0) axis += in_dims.size();
-    auto out_dims = output->dims();
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      PADDLE_ENFORCE_EQ(SortKthvalue<T>(dev_ctx, input, input_width,
-                                        input_height, k, output, indices),
-                        true, platform::errors::External(
-                                  "KthvalueOP: Error when use cub sorting"));
-      return;
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dims);
-        indices->Resize(tmp_out_dims);
-      }
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(in_dims);
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-        trans_out_dims[i] = in_dims[trans[i]];
-      }
-      trans_out_dims[in_dims.size() - 1] = 1;
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_input, trans);
-      framework::Tensor trans_ind, trans_out;
-      trans_ind.mutable_data<int64_t>(trans_out_dims, ctx.GetPlace());
-      trans_out.mutable_data<T>(trans_out_dims, ctx.GetPlace());
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-      PADDLE_ENFORCE_EQ(
-          SortKthvalue<T>(dev_ctx, &trans_input, input_width, input_height, k,
-                          &trans_out, &trans_ind),
-          true,
-          platform::errors::External("KthvalueOP: Error when use cub sorting"));
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, trans_ind, indices, trans);
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
-                                                   output, trans);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KthvalueOpGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(context.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = context.Attr<int>("axis");
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& in_dims = x->dims();
-    auto out_dims = indices->dims();
-    if (axis < 0) axis += in_dims.size();
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const int64_t* indices_data = indices->data<int64_t>();
-    int pre, n, post;
-    GetDims(in_dims, axis, &pre, &n, &post);
-    auto& dev_ctx = context.cuda_device_context();
-    int block_size = getBlockSize(post * k);
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
-    int grid_size = std::min(max_blocks, pre);
-    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    kthvalue,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KthvalueOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    kthvalue_grad,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::KthvalueOpGradCUDAKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h
deleted file mode 100644
index 15df0a10c6992..0000000000000
--- a/paddle/fluid/operators/kthvalue_op.h
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-template <typename T, typename Type>
-static void getKthvalue(Type input_height, Type input_width, int input_dim,
-                        const framework::Tensor* input, T* t_out,
-                        Type* t_indices, const int& k) {
-  bool partial_sort_flag = (k * 64) < input_width;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            return (!std::isnan(static_cast<double>(l.first)) &&
-                    std::isnan(static_cast<double>(r.first))) ||
-                   (l.first < r.first);
-          });
-    } else {
-      std::nth_element(
-          col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-            return (!std::isnan(static_cast<double>(l.first)) &&
-                    std::isnan(static_cast<double>(r.first))) ||
-                   (l.first < r.first);
-          });
-    }
-    t_out[i] = col_vec[k - 1].first;
-    t_indices[i] = col_vec[k - 1].second;
-  }
-}
-
-template <typename T, typename Type>
-static void kthvalueAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim, const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      output_data[i * input_width + e_indices(0)] = e_input(0);
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class KthvalueCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("X");
-    auto* output = context.Output<framework::Tensor>("Out");
-    auto* indices = context.Output<framework::Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    auto out_dims = output->dims();
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                              output_data, indices_data, k);
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dims);
-        indices->Resize(tmp_out_dims);
-      }
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(in_dims);
-
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-        trans_out_dims[i] = in_dims[trans[i]];
-      }
-      trans_out_dims[in_dims.size() - 1] = 1;
-      framework::Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-      framework::Tensor tmp_out, tmp_indices;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      getKthvalue<T, int64_t>(input_height, input_width, in_dims.size(),
-                              &trans_inp, t_out, t_ind, k);
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KthvalueGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-    auto in_dims = x->dims();
-    auto out_dims = indices->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    if (!keepdim) {
-      std::vector<int> tmp_out_shape;
-      for (int i = 0; i < axis; i++) {
-        tmp_out_shape.emplace_back(out_dims[i]);
-      }
-      tmp_out_shape.emplace_back(1);
-      for (int i = axis + 1; i < in_dims.size(); i++) {
-        tmp_out_shape.emplace_back(out_dims[i - 1]);
-      }
-      out_dims = phi::make_ddim(tmp_out_shape);
-    }
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis == in_dims.size() - 1) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      if (keepdim) {
-        kthvalueAssign(input_height, input_width, in_dims.size(), out_grad,
-                       indices, x_grad_data);
-      } else {
-        auto& dev_context =
-            context.template device_context<platform::CPUDeviceContext>();
-        framework::Tensor out_grad_tmp, indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
-                       &indices_tmp, x_grad_data);
-      }
-    } else {
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      framework::Tensor trans_dO, trans_ind;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-      if (keepdim) {
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, *out_grad, &trans_dO, trans);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, *indices, &trans_ind, trans);
-      } else {
-        framework::Tensor out_grad_tmp, indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, out_grad_tmp, &trans_dO, trans);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, indices_tmp, &trans_ind, trans);
-      }
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-      kthvalueAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                                 &trans_dO, &trans_ind, t_out);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 412ae3c49b5f3..c0a4b88fc76fd 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel(
 */
 template <typename T, typename U, typename ScaleT = U,
           typename MaskType = uint8_t>
-void ln_bwd_1024_kernel_driver(
-    const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols,
-    float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr,
-    const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr,
-    ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr,
-    T factor = static_cast<T>(0), T *d_dropout_src_ptr = nullptr) {
+void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows,
+                               const int cols, float epsilon, const T *x_ptr,
+                               const ScaleT *scale_ptr, const U *mean_ptr,
+                               const U *var_ptr, const T *dout_ptr, T *dx_ptr,
+                               ScaleT *dscale_ptr, ScaleT *dbias_ptr,
+                               const MaskType *mask_ptr = nullptr,
+                               T factor = static_cast<T>(0),
+                               T *d_dropout_src_ptr = nullptr) {
   auto stream = dev_ctx.stream();
   if (cols == 1024) {
     // step-1: compute dx and reduced part results of dscale and dbias.
@@ -1334,8 +1336,7 @@ static void LayerNormBackward(
     const U *mean, const U *var, T *d_x,
     LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_scale,
     LayerNormScaleBiasT<T, U, ScaleBiasWithSameTypeX> *d_bias, float epsilon,
-    int64_t batch_size, int64_t feature_size,
-    const platform::CUDADeviceContext &dev_ctx) {
+    int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) {
   auto stream = dev_ctx.stream();
 #ifdef __HIPCC__
   const int kMaxBlockDim = 256;
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index e7d676479be0c..224ab748dab6c 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/layer_norm_op.h"
-
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
                   ops::LayerNormGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp,
                   ops::LayerNormGradNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu
deleted file mode 100644
index dfe73d3727132..0000000000000
--- a/paddle/fluid/operators/layer_norm_op.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
-#include "paddle/fluid/operators/layer_norm_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                               const T *input,
-                                               std::vector<int> input_shape,
-                                               const T *bias, const T *scale,
-                                               T *output, T *mean, T *variance,
-                                               int begin_norm_axis, float eps) {
-  const auto x_dims = phi::make_ddim(input_shape);
-  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-  int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
-  int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
-  switch (GetDesiredBlockDim(feature_size)) {
-    FIXED_BLOCK_DIM_CASE(
-        LayerNormForward<T, T, kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
-            input, scale, bias, output, mean, variance, eps, feature_size));
-    default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Product from begin_norm_axis to end in layer_norm must be larger "
-          "than 1"));
-      break;
-  }
-}
-
-template <typename T>
-class LayerNormKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using U = LayerNormParamType<T>;
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *bias = ctx.Input<Tensor>("Bias");
-    auto *x = ctx.Input<Tensor>("X");
-
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean = ctx.Output<Tensor>("Mean");
-    auto *var = ctx.Output<Tensor>("Variance");
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    const auto x_dims = x->dims();
-    auto *x_data = x->data<T>();
-    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    auto *mean_data = mean->mutable_data<U>(ctx.GetPlace());
-    auto *var_data = var->mutable_data<U>(ctx.GetPlace());
-
-    auto *void_scale_data = (scale == nullptr ? nullptr : scale->data());
-    auto *void_bias_data = (bias == nullptr ? nullptr : bias->data());
-
-    framework::proto::VarType::Type x_dtype =
-        framework::TransToProtoVarType(x->dtype());
-    framework::proto::VarType::Type scale_bias_dtype;
-    if (void_scale_data != nullptr) {
-      scale_bias_dtype = framework::TransToProtoVarType(scale->dtype());
-      if (void_bias_data != nullptr) {
-        PADDLE_ENFORCE_EQ(scale_bias_dtype,
-                          framework::TransToProtoVarType(bias->dtype()),
-                          platform::errors::InvalidArgument(
-                              "Thie Scale and Bias of layer_norm op "
-                              "should have the same data type."));
-      }
-    } else {
-      scale_bias_dtype = (void_bias_data != nullptr
-                              ? framework::TransToProtoVarType(bias->dtype())
-                              : x_dtype);
-    }
-
-    bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype;
-    if (!is_scale_bias_same_dtype_with_x) {
-      PADDLE_ENFORCE_EQ(scale_bias_dtype,
-                        framework::DataTypeTrait<U>::DataType(),
-                        platform::errors::InvalidArgument(
-                            "Unsupported data type of Scale and Bias: %s",
-                            framework::DataTypeToString(scale_bias_dtype)));
-    }
-
-    auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-    int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
-    int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
-
-    auto stream = ctx.cuda_device_context().stream();
-
-#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
-  do {                                                                     \
-    switch (GetDesiredBlockDim(feature_size)) {                            \
-      FIXED_BLOCK_DIM_CASE(                                                \
-          LayerNormForward<T, U, kBlockDim, IsScaleBiasSameDTypeWithX><<<  \
-              batch_size, kBlockDim, 0, stream>>>(                         \
-              x_data, static_cast<const ScaleBiasT *>(void_scale_data),    \
-              static_cast<const ScaleBiasT *>(void_bias_data), y_data,     \
-              mean_data, var_data, epsilon, feature_size));                \
-      default:                                                             \
-        PADDLE_THROW(platform::errors::InvalidArgument(                    \
-            "Product from begin_norm_axis to end must be larger than 1")); \
-        break;                                                             \
-    }                                                                      \
-  } while (0)
-
-#ifdef PADDLE_WITH_CUDA
-    bool can_call_1024_kernel = false;
-    if (feature_size == 1024 && scale != nullptr && bias != nullptr) {
-      can_call_1024_kernel = true;
-    }
-    if (can_call_1024_kernel) {
-      const int WARPS_M = 4;
-      const int WARPS_N = 1;
-      const int THREADS_PER_WARP = 32;
-      const int BYTES_PER_LDG = 16;
-      const int VecSize = BYTES_PER_LDG / sizeof(T);
-
-      const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;
-      const int ROWS_PER_CTA = WARPS_M;
-
-      const int grid = static_cast<int>(
-          std::ceil(batch_size / static_cast<float>(ROWS_PER_CTA)));
-      if (is_scale_bias_same_dtype_with_x) {
-        ln_fwd_1024_kernel<T, U, T, VecSize, WARPS_M, WARPS_N,
-                           BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
-            batch_size, feature_size, epsilon, x_data,
-            static_cast<const T *>(void_scale_data),
-            static_cast<const T *>(void_bias_data), mean_data, var_data,
-            y_data);
-      } else {
-        ln_fwd_1024_kernel<T, U, U, VecSize, WARPS_M, WARPS_N,
-                           BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
-            batch_size, feature_size, epsilon, x_data,
-            static_cast<const U *>(void_scale_data),
-            static_cast<const U *>(void_bias_data), mean_data, var_data,
-            y_data);
-      }
-    } else {
-#endif
-      if (is_scale_bias_same_dtype_with_x) {
-        PADDLE_LAUNCH_LAYERNORM_FWD(T, true);
-      } else {
-        PADDLE_LAUNCH_LAYERNORM_FWD(U, false);
-      }
-#ifdef PADDLE_WITH_CUDA
-    }
-#endif
-
-#undef PADDLE_LAUNCH_LAYERNORM_FWD
-  }
-};
-
-template <typename T>
-class LayerNormGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using U = LayerNormParamType<T>;
-    const float epsilon = ctx.Attr<float>("epsilon");
-    // d_x, d_scale, d_bias may be nullptr
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *mean = ctx.Input<Tensor>("Mean");
-    auto *var = ctx.Input<Tensor>("Variance");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *bias = ctx.Input<Tensor>("Bias");
-    auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-
-    const auto &x_dims = x->dims();
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-    int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
-    int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
-
-    auto *x_data = x->data<T>();
-    auto *d_y_data = d_y->data<T>();
-
-    auto *mean_data = mean->data<U>();
-    auto *var_data = var->data<U>();
-
-    auto *d_x_data =
-        (d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace()));
-
-    framework::proto::VarType::Type x_dtype =
-        framework::TransToProtoVarType(x->dtype());
-    framework::proto::VarType::Type scale_bias_dtype;
-    if (scale != nullptr) {
-      scale_bias_dtype = framework::TransToProtoVarType(scale->dtype());
-    } else {
-      // FIXME(zengjinle): do not find a better way to get the right
-      // data type of the d_scale and d_bias if scale == nullptr.
-      auto *bias = ctx.Input<Tensor>("Bias");
-      if (bias != nullptr) {
-        scale_bias_dtype = framework::TransToProtoVarType(bias->dtype());
-      } else {
-        scale_bias_dtype = x_dtype;
-      }
-    }
-
-#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
-  do {                                                                     \
-    auto *scale_data =                                                     \
-        (scale == nullptr ? nullptr : scale->data<ScaleBiasT>());          \
-    auto *d_scale_data =                                                   \
-        (d_scale == nullptr ? nullptr : d_scale->mutable_data<ScaleBiasT>( \
-                                            ctx.GetPlace()));              \
-    auto *d_bias_data =                                                    \
-        (d_bias == nullptr ? nullptr : d_bias->mutable_data<ScaleBiasT>(   \
-                                           ctx.GetPlace()));               \
-    auto *d_x_data =                                                       \
-        (d_x == nullptr ? nullptr : d_x->mutable_data<T>(ctx.GetPlace())); \
-    LayerNormBackward<T, U, IsScaleBiasSameDTypeWithX>(                    \
-        x_data, d_y_data, scale_data, mean_data, var_data, d_x_data,       \
-        d_scale_data, d_bias_data, epsilon, batch_size, feature_size,      \
-        ctx.cuda_device_context());                                        \
-  } while (0)
-
-    if (scale_bias_dtype == x_dtype) {
-      PADDLE_LAUNCH_LAYERNORM_BWD(T, true);
-    } else {
-      PADDLE_LAUNCH_LAYERNORM_BWD(U, false);
-    }
-
-#undef PADDLE_LAUNCH_LAYERNORM_BWD
-  }
-};
-
-template class LayerNormDirectCUDAFunctor<float>;
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-#ifdef PADDLE_WITH_HIP
-// MIOPEN do not support double
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>);
-#elif CUDNN_VERSION_MIN(8, 1, 0)
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::bfloat16>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LayerNormKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    layer_norm_grad,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::LayerNormGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>);
-#endif
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
deleted file mode 100644
index 9d70b7cf70743..0000000000000
--- a/paddle/fluid/operators/layer_norm_op.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
-    !defined(__OSX__)
-#include "paddle/fluid/operators/jit/kernels.h"
-#endif
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace platform {
-class CPUDeviceContext;
-class CUDADeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-
-// Wrap RowwiseMean and ColwiseMean.
-// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
-// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
-// implementation only considers 2D.
-template <typename DeviceContext, typename T>
-struct RowwiseMean2D {
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
-
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* vec);
-};
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
-class RowwiseMean2D<platform::CUDADeviceContext, T> {
- public:
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
-      : left_(left), right_(right) {
-    framework::DDim ones_dim({right_});
-    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
-    phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right);
-  }
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
-        false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
-        out->data<T>());
-  }
-
- private:
-  int left_;
-  int right_;
-  framework::Tensor divisor_;
-};
-#endif
-
-template <typename T>
-class RowwiseMean2D<platform::CPUDeviceContext, T> {
- public:
-  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    row_mean_(context, input, out);
-  }
-
- private:
-  phi::funcs::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
-};
-
-template <typename DeviceContext, typename T>
-struct ColwiseSum2D {
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
-
-  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* vec);
-};
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
-class ColwiseSum2D<platform::CUDADeviceContext, T> {
- public:
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
-      : left_(left), right_(right) {
-    framework::DDim ones_dim({left_});
-    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
-    phi::funcs::set_constant(dev_ctx, &divisor_, 1.0);
-  }
-
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    phi::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
-        true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
-        out->data<T>());
-  }
-
- private:
-  int left_;
-  int right_;
-  framework::Tensor divisor_;
-};
-#endif
-
-template <typename T>
-class ColwiseSum2D<platform::CPUDeviceContext, T> {
- public:
-  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* out) {
-    col_wise_(context, input, out);
-  }
-
- private:
-  phi::funcs::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
-};
-
-template <typename T>
-struct SubAndSquareFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
-};
-
-template <typename T>
-struct DivAndSqrtFunctor {
-  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
-  inline HOSTDEVICE T operator()(T a, T b) const {
-    return a / (sqrt(b + epsilon_));
-  }
-
- private:
-  T epsilon_;
-};
-
-template <typename T>
-struct MulInvVarFunctor {
-  inline HOSTDEVICE T operator()(T a, T b) const {
-    return a * std::sqrt(1.0 / b);
-  }
-};
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
-class LayerNormDirectCUDAFunctor {
- public:
-  void operator()(gpuStream_t stream, const T* input,
-                  std::vector<int> input_shape, const T* bias, const T* scale,
-                  T* output, T* mean, T* variance, int begin_norm_axis,
-                  float eps);
-};
-#endif
-
-template <typename DeviceContext, typename T>
-class LayerNormKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto x = *ctx.Input<Tensor>("X");
-
-    auto* y = ctx.Output<Tensor>("Y");
-    auto* mean = ctx.Output<Tensor>("Mean");
-    auto* var = ctx.Output<Tensor>("Variance");
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    const auto x_dims = x.dims();
-
-    y->mutable_data<T>(ctx.GetPlace());
-    mean->mutable_data<T>(ctx.GetPlace());
-    var->mutable_data<T>(ctx.GetPlace());
-
-    auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    framework::DDim matrix_shape({left, right});
-
-    x.Resize(matrix_shape);
-    Tensor out;
-    out.ShareDataWith(*y);
-    out.Resize(matrix_shape);
-
-#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
-    defined(__OSX__)
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
-
-    // get mean
-    row_mean(dev_ctx, x, mean);
-
-    // get variance
-    ElementwiseComputeEx<SubAndSquareFunctor<T>, DeviceContext, T>(
-        ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor<T>(), &out);
-    row_mean(dev_ctx, out, var);
-
-    // get x_norm
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-        ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &out);
-    ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-        ctx, &out, var, /*axis*/ 0,
-        DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &out);
-
-    if (scale) {
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &out, scale, /*axis*/ 1, MulFunctor<T>(), &out);
-    }
-    if (bias) {
-      ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(
-          ctx, &out, bias, /*axis*/ 1, AddFunctor<T>(), &out);
-    }
-#else
-    PADDLE_ENFORCE_EQ(mean->numel(), left,
-                      platform::errors::InvalidArgument(
-                          "mean's length (%d) is not equal with expected (%d).",
-                          mean->numel(), left));
-    PADDLE_ENFORCE_EQ(var->numel(), left,
-                      platform::errors::InvalidArgument(
-                          "var's length (%d) is not equal with expected (%d).",
-                          var->numel(), left));
-    if (scale) {
-      PADDLE_ENFORCE_EQ(
-          scale->numel(), right,
-          platform::errors::InvalidArgument(
-              "scale's length (%d) is not equal with expected (%d).",
-              scale->numel(), right));
-    }
-    if (bias) {
-      PADDLE_ENFORCE_EQ(
-          bias->numel(), right,
-          platform::errors::InvalidArgument(
-              "bias's length (%d) is not equal with expected (%d).",
-              bias->numel(), right));
-    }
-
-    auto ker =
-        jit::KernelFuncs<jit::LayerNormTuple<T>, platform::CPUPlace>::Cache()
-            .At(right);
-    ker(x.data<T>(), out.data<T>(), mean->data<T>(), var->data<T>(),
-        scale ? scale->data<T>() : nullptr, bias ? bias->data<T>() : nullptr,
-        static_cast<int>(left), static_cast<const float>(epsilon), right);
-#endif
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LayerNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const float epsilon = ctx.Attr<float>("epsilon");
-    auto x = *ctx.Input<Tensor>("X");
-    auto* mean = ctx.Input<Tensor>("Mean");
-    auto* var = ctx.Input<Tensor>("Variance");
-    auto* scale = ctx.Input<Tensor>("Scale");
-    auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-
-    // init output
-    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-
-    const auto& x_dims = x.dims();
-    auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]);
-    int right = static_cast<int>(matrix_dim[1]);
-    framework::DDim matrix_shape({left, right});
-
-    d_y.Resize(matrix_shape);
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
-                                               ctx.device_context());
-
-    Tensor temp;
-    Tensor temp_norm;
-    if (d_scale || d_x) {
-      x.Resize(matrix_shape);
-      temp.mutable_data<T>(matrix_shape, ctx.GetPlace());
-
-      temp_norm.mutable_data<T>(matrix_shape, ctx.GetPlace());
-      // get x_norm
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, &x, mean, /*axis*/ 0, SubFunctor<T>(), &temp_norm);
-      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, var, /*axis*/ 0,
-          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), &temp_norm);
-    }
-
-    if (d_bias) {
-      d_bias->mutable_data<T>(ctx.GetPlace());
-      colwise_sum(dev_ctx, d_y, d_bias);
-    }
-    if (d_scale) {
-      d_scale->mutable_data<T>(ctx.GetPlace());
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor<T>(), &temp);
-      colwise_sum(dev_ctx, temp, d_scale);
-    }
-
-    if (d_x) {
-      framework::DDim vec_shape({left});
-      d_x->mutable_data<T>(ctx.GetPlace());
-      auto dx_dim = d_x->dims();
-      Tensor temp_vec;
-      temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
-
-      RowwiseMean2D<DeviceContext, T> row_mean(left, right,
-                                               ctx.device_context());
-
-      if (d_scale) {
-        // dy_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
-        framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
-
-        // dy_dmean_dx
-        row_mean(dev_ctx, temp, &temp_vec);
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-        // dy_var_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
-      } else {
-        // dy_dx
-        framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
-
-        // dy_dmean_dx
-        row_mean(dev_ctx, d_y, &temp_vec);
-        ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-            ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-        // dy_var_dx
-        ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-            ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
-      }
-      // dy_var_dx
-      row_mean(dev_ctx, temp, &temp_vec);
-      ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
-          ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor<T>(), &temp);
-      ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-          ctx, d_x, &temp, /*axis*/ 0, SubFunctor<T>(), d_x);
-
-      ElementwiseComputeEx<DivAndSqrtFunctor<T>, DeviceContext, T>(
-          ctx, d_x, var, /*axis*/ 0,
-          DivAndSqrtFunctor<T>(static_cast<T>(epsilon)), d_x);
-      d_x->Resize(dx_dim);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc
index c88880b43fff9..3c7e5bf9593e0 100644
--- a/paddle/fluid/operators/layer_norm_op_npu.cc
+++ b/paddle/fluid/operators/layer_norm_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/layer_norm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc
index 0480a354c8bd8..3b21a55f8df0d 100644
--- a/paddle/fluid/operators/layer_norm_op_xpu.cc
+++ b/paddle/fluid/operators/layer_norm_op_xpu.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/layer_norm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc
index 148fb05afcfd9..72c6b41efa989 100644
--- a/paddle/fluid/operators/lgamma_op.cc
+++ b/paddle/fluid/operators/lgamma_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/lgamma_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -35,16 +38,6 @@ This operator performs elementwise lgamma for input $X$.
 class LgammaOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma");
-
-    auto in_dims = ctx->GetInputDim("X");
-
-    ctx->SetOutputDim("Out", in_dims);
-    ctx->ShareLoD("X", "Out");
-  }
 };
 
 template <typename T>
@@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMeta));
+
 REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker,
                   ops::LgammaGradMaker<paddle::framework::OpDesc>,
-                  ops::LgammaGradMaker<paddle::imperative::OpBase>);
+                  ops::LgammaGradMaker<paddle::imperative::OpBase>,
+                  LgammaInferShapeFunctor);
 
 REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    lgamma, ops::LgammaKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LgammaKernel<paddle::platform::CPUDeviceContext, double>)
-
-REGISTER_OP_CPU_KERNEL(
-    lgamma_grad,
-    ops::LgammaGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LgammaGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu
deleted file mode 100644
index b9f273727b00b..0000000000000
--- a/paddle/fluid/operators/lgamma_op.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/lgamma_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct CudaLgammaFunctor {
-  __device__ __forceinline__ T operator()(const T x) const {
-    return Eigen::numext::lgamma(x);
-  }
-};
-
-template <typename T>
-class LgammaKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    auto& dev_ctx = context.device_context<platform::CUDADeviceContext>();
-    std::vector<const framework::Tensor*> ins = {x};
-    std::vector<framework::Tensor*> outs = {out};
-    auto functor = CudaLgammaFunctor<T>();
-    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
-                                                              &outs, functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    lgamma, ops::LgammaKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LgammaKernel<paddle::platform::CUDADeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    lgamma_grad,
-    ops::LgammaGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LgammaGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h
deleted file mode 100644
index 674054e745732..0000000000000
--- a/paddle/fluid/operators/lgamma_op.h
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <unsupported/Eigen/SpecialFunctions>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct LgammaFunctor {
-  LgammaFunctor(const T* input, T* output, int64_t numel)
-      : input_(input), output_(output), numel_(numel) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    output_[idx] = Eigen::numext::lgamma(input_[idx]);
-  }
-
- private:
-  const T* input_;
-  T* output_;
-  int64_t numel_;
-};
-
-template <typename T>
-struct LgammaGradFunctor {
-  LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
-      : dout_(dout), x_(x), output_(output), numel_(numel) {}
-
-  HOSTDEVICE void operator()(int64_t idx) const {
-    output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]);
-  }
-
- private:
-  const T* dout_;
-  const T* x_;
-  T* output_;
-  int64_t numel_;
-};
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class LgammaKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    Tensor* out = context.Output<Tensor>("Out");
-
-    auto numel = x->numel();
-    auto* x_data = x->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace(),
-                                          size_t(x->numel() * sizeof(T)));
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    LgammaFunctor<T> functor(x_data, out_data, numel);
-    for_range(functor);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LgammaGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    const framework::Tensor* d_out =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    const framework::Tensor* x = ctx.Input<framework::Tensor>("X");
-    framework::Tensor* d_x =
-        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto numel = d_out->numel();
-    auto* dout_data = d_out->data<T>();
-    auto* x_data = x->data<T>();
-    auto* dx_data = d_x->mutable_data<T>(
-        ctx.GetPlace(), static_cast<size_t>(numel * sizeof(T)));
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
-    LgammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
-    for_range(functor);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc
index 0e69b397e04c7..da38f906b9bd3 100644
--- a/paddle/fluid/operators/log_softmax_op.cc
+++ b/paddle/fluid/operators/log_softmax_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/log_softmax_op.h"
 #include <string>
 #include <unordered_map>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    return UnaryOpUnchangedInferShapeCheckAxis(ctx);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-
+DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor,
+                            PD_INFER_META(phi::UnchangedInferMetaCheckAxis));
 REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker,
                   ops::LogSoftmaxOpInferVarType,
                   ops::LogSoftmaxGradOpMaker<paddle::framework::OpDesc>,
-                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>);
+                  ops::LogSoftmaxGradOpMaker<paddle::imperative::OpBase>,
+                  LogSoftmaxInferShapeFunctor);
 REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    log_softmax,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    log_softmax_grad,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::LogSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu
deleted file mode 100644
index 26b6ce43303d1..0000000000000
--- a/paddle/fluid/operators/log_softmax_op.cu
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/log_softmax_op.h"
-#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class LogSoftmaxKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *x = ctx.Input<Tensor>("X");
-    auto *out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    int input_axis = ctx.Attr<int>("axis");
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    phi::SoftmaxForwardCUDAKernelDriver<T, true>(dev_ctx, *x, input_axis, out);
-  }
-};
-
-template <typename T>
-class LogSoftmaxGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *out = ctx.Input<Tensor>("Out");
-    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    int input_axis = ctx.Attr<int>("axis");
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    phi::SoftmaxBackwardCUDAKernelDriver<T, true>(dev_ctx, *out, *dout,
-                                                  input_axis, dx);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-#ifdef PADDLE_WITH_HIP
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
-#else
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax, ops::LogSoftmaxKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxKernel<plat::CUDADeviceContext, plat::bfloat16>);
-REGISTER_OP_CUDA_KERNEL(
-    log_softmax_grad, ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::LogSoftmaxGradKernel<plat::CUDADeviceContext, plat::bfloat16>);
-#endif
diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h
deleted file mode 100644
index 162087a75662d..0000000000000
--- a/paddle/fluid/operators/log_softmax_op.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-static inline int CanonicalAxis(const int axis, const int rank) {
-  if (axis < 0) {
-    return axis + rank;
-  }
-  return axis;
-}
-
-static inline size_t SizeToAxis(const int axis, const framework::DDim dims) {
-  size_t size = 1;
-  for (int i = 0; i < axis; i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) {
-  size_t size = 1;
-  for (int i = axis; i < dims.size(); i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-template <typename T>
-struct ValueClip {
-  HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = static_cast<T>(-64.);
-    return x < kThreshold ? kThreshold : x;
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct LogSoftmaxFunctor {
-  void operator()(const DeviceContext& context, const framework::Tensor* X,
-                  framework::Tensor* Y, const int axis) {
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-    constexpr int kAxisDim = 1;
-
-    int axis_dim = X->dims()[axis];
-    const int n = SizeToAxis(axis, X->dims());
-    const int d = SizeFromAxis(axis, X->dims());
-    framework::DDim dim_2d{n, d};
-
-    auto logits = EigenMatrix<T>::From(*X, dim_2d);
-    auto log_softmax = EigenMatrix<T>::From(*Y, dim_2d);
-
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-    const int num_remain = num_classes / axis_dim;
-
-    Eigen::DSizes<int, 1> along_axis(kAxisDim);
-    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
-    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
-    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-
-    // For numerical stability, logits should be shifted by maximum number along
-    // axis, calculate shifted_logits into log_softmax tensor for memory reuse.
-    if (num_remain == 1) {
-      // axis == -1, axis and class in same dimension, calculate along
-      // class dimension directly for higher performance
-      log_softmax.device(*context.eigen_device()) =
-          (logits -
-           logits.maximum(along_axis)
-               .eval()
-               .reshape(batch_by_one)
-               .broadcast(one_by_class))
-              .unaryExpr(ValueClip<T>());
-    } else {
-      // axis != -1, class dimension split into (axis, remain), max and sum
-      // should be calculated along axis dimension
-      log_softmax.device(*context.eigen_device()) =
-          (logits.reshape(batch_axis_remain) -
-           logits.reshape(batch_axis_remain)
-               .maximum(along_axis)
-               .eval()
-               .reshape(batch_one_remain)
-               .broadcast(one_axis_one)
-               .reshape(batch_classes))
-              .unaryExpr(ValueClip<T>());
-    }
-
-    log_softmax.device(*context.eigen_device()) =
-        log_softmax -
-        log_softmax.exp()
-            .eval()
-            .reshape(batch_axis_remain)
-            .sum(along_axis)
-            .log()
-            .broadcast(one_axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LogSoftmaxKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Out = context.Output<framework::Tensor>("Out");
-    const int rank = X->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    Out->mutable_data<T>(context.GetPlace());
-
-    if (X->numel() != 0) {
-      LogSoftmaxFunctor<DeviceContext, T>()(
-          context.template device_context<DeviceContext>(), X, Out, axis);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-struct LogSoftmaxGradFunctor {
-  void operator()(const DeviceContext& context, const framework::Tensor* Y,
-                  const framework::Tensor* dY, framework::Tensor* dX,
-                  const int axis) {
-    constexpr int kBatchDim = 0;
-    constexpr int kClassDim = 1;
-
-    const int n = SizeToAxis(axis, Y->dims());
-    const int d = SizeFromAxis(axis, Y->dims());
-    framework::DDim dim_2d{n, d};
-
-    auto y = EigenMatrix<T>::From(*Y, dim_2d);
-    auto dy = EigenMatrix<T>::From(*dY, dim_2d);
-    auto dx = EigenMatrix<T>::From(*dX, dim_2d);
-
-    const int axis_dim = Y->dims()[axis];
-    const int batch_size = y.dimension(kBatchDim);
-    const int num_classes = y.dimension(kClassDim);
-    const int num_remain = num_classes / axis_dim;
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
-    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
-
-    dx.device(*context.eigen_device()) =
-        dy -
-        (y.exp()) * (dy.reshape(batch_axis_remain)
-                         .sum(along_class)
-                         .broadcast(one_axis));
-  }
-};
-
-template <typename DeviceContext, typename T>
-class LogSoftmaxGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<framework::Tensor>("Out");
-    auto* dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    const int rank = Out->dims().size();
-    const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
-
-    // allocate memory on device.
-    dX->mutable_data<T>(context.GetPlace());
-
-    if (Out->numel() != 0) {
-      LogSoftmaxGradFunctor<DeviceContext, T>()(
-          context.template device_context<DeviceContext>(), Out, dOut, dX,
-          axis);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc
index 5795f1dffac78..6ce21aec9215a 100644
--- a/paddle/fluid/operators/log_softmax_op_npu.cc
+++ b/paddle/fluid/operators/log_softmax_op_npu.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/log_softmax_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
 
 namespace paddle {
 namespace operators {
@@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel<T> {
     auto* X = ctx.Input<framework::Tensor>("X");
     auto* Out = ctx.Output<framework::Tensor>("Out");
     const int rank = X->dims().size();
-    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
+    const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
     Out->mutable_data<T>(ctx.GetPlace());
 
     if (X->numel() != 0) {
@@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel<T> {
     auto* dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     const int rank = dOut->dims().size();
-    const int axis = CanonicalAxis(ctx.Attr<int>("axis"), rank);
+    const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
 
     // allocate memory on device.
     dX->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc
index 47a00a93a6472..48ae080783d11 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op.cc
@@ -203,14 +203,6 @@ REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad,
                   ops::LookupTableV2GradOpNoBufferVarsInferer,
                   ops::LookupTableV2OpGradVarTypeInference);
 
-REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
-                       ops::LookupTableV2Kernel<double>,
-                       ops::LookupTableV2Kernel<paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    lookup_table_v2_grad, ops::LookupTableV2GradKernel<float>,
-    ops::LookupTableV2GradKernel<double>,
-    ops::LookupTableV2GradKernel<paddle::platform::bfloat16>);
-
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(lookup_table_v2)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu
index d40b264378570..74d089e23a82c 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.cu
+++ b/paddle/fluid/operators/lookup_table_v2_op.cu
@@ -235,13 +235,3 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
 
 }  // namespace operators
 }  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(lookup_table_v2, ops::LookupTableV2CUDAKernel<float>,
-                        ops::LookupTableV2CUDAKernel<double>,
-                        ops::LookupTableV2CUDAKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(lookup_table_v2_grad,
-                        ops::LookupTableV2GradCUDAKernel<float>,
-                        ops::LookupTableV2GradCUDAKernel<double>,
-                        ops::LookupTableV2GradCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index 65297abe3e49b..88d70d9bb7dae 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel {
       auto ar = paddle::framework::AttrReader(attrs);
       const std::string data_format = ar.Get<std::string>("data_format");
       auto dl = framework::StringToDataLayout(data_format);
-      // Some models may have intentionally set "AnyLayout" for pool
+      // Some models may have intentionally set "AnyLayout" for lrn
       // op. Treat this as NCHW (default data_format value)
       if (dl != framework::DataLayout::kAnyLayout) {
         return framework::OpKernelType(expected_kernel_type.data_type_,
diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h
index 214b2eccae9f7..2414ae68438fd 100644
--- a/paddle/fluid/operators/lu_op.h
+++ b/paddle/fluid/operators/lu_op.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/operators/set_value_op.h"
 #include "paddle/fluid/operators/svd_helper.h"
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
 #include "paddle/phi/kernels/triangular_solve_kernel.h"
 
 namespace paddle {
@@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU,
   const auto W = udims[udims.size() - 1];
   auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
   platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
-  TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W, L_dataptr);
+  phi::funcs::TrilTriuCompute<T> tril_computer(LU->data<T>(), -1, true, H, W,
+                                               L_dataptr);
   x_for_range(tril_computer);
 
-  TrilTriuCompute<T> triu_computer(LU->data<T>(), 0, false, H, W,
-                                   U->mutable_data<T>(dev_ctx.GetPlace()));
+  phi::funcs::TrilTriuCompute<T> triu_computer(
+      LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
   x_for_range(triu_computer);
 
   // set L's diagonal 1
@@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel<T> {
     auto phil_rank = LmHdims.size();
     auto phiu_rank = UmHdims.size();
     platform::ForRange<DeviceContext> l_for_range(dev_ctx, phi_L.numel());
-    TrilTriuCompute<T> tril_computer(phi_L.data<T>(), -1, true,
-                                     LmHdims[phil_rank - 2],
-                                     LmHdims[phil_rank - 1], phi_L.data<T>());
+    phi::funcs::TrilTriuCompute<T> tril_computer(
+        phi_L.data<T>(), -1, true, LmHdims[phil_rank - 2],
+        LmHdims[phil_rank - 1], phi_L.data<T>());
     l_for_range(tril_computer);
 
     platform::ForRange<DeviceContext> u_for_range(dev_ctx, phi_U.numel());
-    TrilTriuCompute<T> triu_computer(phi_U.data<T>(), 0, false,
-                                     UmHdims[phiu_rank - 2],
-                                     UmHdims[phiu_rank - 1], phi_U.data<T>());
+    phi::funcs::TrilTriuCompute<T> triu_computer(
+        phi_U.data<T>(), 0, false, UmHdims[phiu_rank - 2],
+        UmHdims[phiu_rank - 1], phi_U.data<T>());
     u_for_range(triu_computer);
 
     Tensor_Add<DeviceContext, T>(dev_ctx, phi_L, phi_U, &phi);
@@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel<T> {
         const auto W = phidims[phidims.size() - 1];
         platform::ForRange<DeviceContext> x_for_range(dev_ctx,
                                                       phi_complement.numel());
-        TrilTriuCompute<T> tril_computer(phi_complement.data<T>(), -1, true, H,
-                                         W, phi_complement_l.data<T>());
+        phi::funcs::TrilTriuCompute<T> tril_computer(
+            phi_complement.data<T>(), -1, true, H, W,
+            phi_complement_l.data<T>());
         x_for_range(tril_computer);
 
         Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_l, &phi);
@@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel<T> {
       const auto W = phidims[phidims.size() - 1];
       platform::ForRange<DeviceContext> x_for_range(dev_ctx,
                                                     phi_complement.numel());
-      TrilTriuCompute<T> triu_computer(phi_complement.data<T>(), 0, false, H, W,
-                                       phi_complement_u.data<T>());
+      phi::funcs::TrilTriuCompute<T> triu_computer(
+          phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
       x_for_range(triu_computer);
 
       Tensor_Sub<DeviceContext, T>(dev_ctx, phi, phi_complement_u, &phi);
diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h
index d2303f2c08da8..e4100867dc685 100644
--- a/paddle/fluid/operators/lu_unpack_op.h
+++ b/paddle/fluid/operators/lu_unpack_op.h
@@ -16,7 +16,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/lu_op.h"
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
 
 namespace paddle {
 namespace operators {
@@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
     auto W = ldims[ldims.size() - 1];
     auto L_dataptr = dl_tril.mutable_data<T>(dev_ctx.GetPlace());
     platform::ForRange<DeviceContext> l_for_range(dev_ctx, dl->numel());
-    TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W, L_dataptr);
+    phi::funcs::TrilTriuCompute<T> tril_computer(dl->data<T>(), -1, true, H, W,
+                                                 L_dataptr);
     l_for_range(tril_computer);
 
     const auto udims = du->dims();
@@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel<T> {
     W = udims[udims.size() - 1];
     auto U_dataptr = du_triu.mutable_data<T>(dev_ctx.GetPlace());
     platform::ForRange<DeviceContext> u_for_range(dev_ctx, du->numel());
-    TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W, U_dataptr);
+    phi::funcs::TrilTriuCompute<T> triu_computer(du->data<T>(), 0, false, H, W,
+                                                 U_dataptr);
     u_for_range(triu_computer);
 
     auto xdims = dx->dims();
diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc
index a6eb535c693b8..1887bbcfb7efd 100644
--- a/paddle/fluid/operators/masked_select_op.cc
+++ b/paddle/fluid/operators/masked_select_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect");
-    OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect");
-    OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect");
-
-    // output will only be a 1-D Tensor
-    ctx->SetOutputDim("Y", phi::make_ddim({-1}));
-    ctx->ShareLoD("X", /*->*/ "Y");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer,
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor,
+                            PD_INFER_META(phi::MaskedSelectInferMeta));
+
 REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker,
                   ops::MaskedSelectGradOpMaker<paddle::framework::OpDesc>,
-                  ops::MaskedSelectGradOpMaker<paddle::imperative::OpBase>);
+                  ops::MaskedSelectGradOpMaker<paddle::imperative::OpBase>,
+                  MaksedSelectInferShapeFunctor);
 REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad,
                   ops::MaskedSelectedGradNoNeedBufferVarsInferer);
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
index c9308d27c0a34..e1861b2f7c5ea 100644
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -243,8 +243,6 @@ class ConcatFunctor<platform::MLUDeviceContext, T> {
 
     const int axis_t = axis;
     const int ins_size_t = ins_size;
-    auto place = context.GetPlace();
-    output->mutable_data<T>(place);
 
     // mlu should do sth
     // init ins tensors
@@ -295,7 +293,6 @@ class SplitFunctor<platform::MLUDeviceContext, T> {
     std::vector<cnnlTensorDescriptor_t> desc_vector;
     for (size_t i = 0; i < out_size; i++) {
       (*outputs)[i]->Resize(outs_dims[i]);
-      (*outputs)[i]->mutable_data<T>(context.GetPlace());
       output_descs.emplace_back(
           MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
                             ToCnnlDataType((*outputs)[i]->dtype())));
diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h
index 9994ccc10cb13..b77e23450360c 100644
--- a/paddle/fluid/operators/math/inclusive_scan.h
+++ b/paddle/fluid/operators/math/inclusive_scan.h
@@ -34,10 +34,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename InputIterator, typename OutputIterator, typename BinaryOp>
+template <typename InputIterator, typename OutputIterator, typename BinaryOp,
+          typename Context>
 static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter,
-                             size_t n, BinaryOp op,
-                             const platform::CUDADeviceContext &dev_ctx) {
+                             size_t n, BinaryOp op, const Context &dev_ctx) {
   memory::AllocationPtr allocation;
   void *temp_storage = nullptr;
   size_t temp_storage_bytes = 0;
@@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y,
   }
 }
 
-template <typename T, typename BinaryOp>
+template <typename T, typename BinaryOp, typename Context>
 static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
                                   size_t inner_dim, T init, BinaryOp op,
-                                  bool reverse,
-                                  const platform::CUDADeviceContext &dev_ctx) {
+                                  bool reverse, const Context &dev_ctx) {
   constexpr size_t kThreadNumX = 16;
   constexpr size_t kThreadNumY = 32;
 
@@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim,
   }
 }
 
-template <typename T, typename BinaryOp>
+template <typename T, typename BinaryOp, typename Context>
 void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim,
                    size_t inner_dim, T init, BinaryOp op, bool reverse,
-                   const platform::CUDADeviceContext &dev_ctx) {
+                   const Context &dev_ctx) {
   if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
 
   if (outer_dim == 1 && inner_dim == 1) {
@@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim,
       CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
     }
   } else if (inner_dim != 1) {
-    platform::ForRange<platform::CUDADeviceContext> for_range(
-        dev_ctx, outer_dim * inner_dim);
+    platform::ForRange<Context> for_range(dev_ctx, outer_dim * inner_dim);
     if (reverse) {
       for_range(
           InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc
index cdf204628b638..56f65340ea999 100644
--- a/paddle/fluid/operators/matrix_power_op.cc
+++ b/paddle/fluid/operators/matrix_power_op.cc
@@ -14,8 +14,11 @@
 
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,26 +26,6 @@ namespace operators {
 class MatrixPowerOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power");
-    auto dims = ctx->GetInputDim("X");
-    auto n_dim = dims.size();
-    PADDLE_ENFORCE_GE(n_dim, 2,
-                      platform::errors::InvalidArgument(
-                          "The Input(X) should have at least 2 dimensions. But "
-                          "received a %d dimension tensor.",
-                          n_dim));
-    PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1],
-                      platform::errors::InvalidArgument(
-                          "The inner-most 2 dimensions of Input(X) all should "
-                          "be square matrices "
-                          "But received X's shape[-2] = %d and shape[-1] = %d.",
-                          dims[n_dim - 2], dims[n_dim - 1]));
-    ctx->SetOutputDim("Out", dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
 };
 
 class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -116,9 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor,
+                            PD_INFER_META(phi::MatrixPowerInferMeta));
+
 REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker,
                   ops::MatrixPowerOpInferVarType,
                   ops::MatrixPowerGradOpMaker<paddle::framework::OpDesc>,
-                  ops::MatrixPowerGradOpMaker<paddle::imperative::OpBase>);
+                  ops::MatrixPowerGradOpMaker<paddle::imperative::OpBase>,
+                  MatrixPowerInferShapeFunctor);
 
 REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp);
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 83fe1aa6dd148..785b16ae283b9 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanKernel<paddle::platform::CPUDeviceContext, double>,
     ops::MeanKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::bfloat16>);
+                    paddle::platform::bfloat16>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext,
+                    paddle::platform::complex<float>>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext,
+                    paddle::platform::complex<double>>);
 REGISTER_OP_CPU_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::bfloat16>);
+                        paddle::platform::bfloat16>,
+    ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
+                        paddle::platform::complex<float>>,
+    ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
+                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 01a5632a960c3..e8964765ec654 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -102,10 +102,17 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>,
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>,
+    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
+                        paddle::platform::complex<float>>,
+    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
+                        paddle::platform::complex<double>>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
-                            plat::float16>);
+                            paddle::platform::complex<float>>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
+                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
index 812c55cdd5055..2e82b47e8da1c 100644
--- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/layer_norm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/phi/common/data_type.h"
 
 namespace paddle {
 namespace operators {
@@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     layer_norm_p->execute(astream, args);
     astream.wait();
 
-    y->set_layout(DataLayout::kMKLDNN);
+    y->set_layout(phi::DataLayout::kMKLDNN);
     y->set_format(platform::GetMKLDNNFormat(*dst_memory));
   }
 };
diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
index bdb4fe1198a8e..86ecb01c89af7 100644
--- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
@@ -50,13 +50,8 @@ class PReluMKLDNNHandler
       if (weights->dims().size() != x->dims().size()) {
         auto new_weights_dims = std::vector<int64_t>(x->dims().size(), 1);
         if (mode == "channel") {
-          if (data_format == "NHWC") {
-            new_weights_dims[x->dims().size() - 1] =
-                *std::max_element(weights_dims.begin(), weights_dims.end());
-          } else {
-            new_weights_dims[1] =
-                *std::max_element(weights_dims.begin(), weights_dims.end());
-          }
+          new_weights_dims[1] =
+              *std::max_element(weights_dims.begin(), weights_dims.end());
         }
         weights_dims = std::move(new_weights_dims);
       }
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
index 717af61b858dc..0e988557df626 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc
@@ -98,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) {
 
 TEST(test_pool2d_relu_relu_nhwc, cpu_place) {
   framework::DDim dims({1, 4, 8, 512});           // NHWC shape
-  framework::DDim expected_dims({1, 512, 3, 7});  // NHWC expected shape
+  framework::DDim expected_dims({1, 512, 3, 7});  // NCHW expected shape
   platform::CPUPlace p;
   framework::Scope scope;
 
diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc
index c7fb92cd5107c..9c16ccb138f7d 100644
--- a/paddle/fluid/operators/mode_op.cc
+++ b/paddle/fluid/operators/mode_op.cc
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/mode_op.h"
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 
@@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode");
-
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_EQ(
-        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
-        paddle::platform::errors::InvalidArgument(
-            "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
-            dim_size, dim_size, axis));
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of ModeOp must have >= 1d shape"));
-    if (axis < 0) axis += dim_size;
-    bool keepdim = ctx->Attrs().Get<bool>("keepdim");
-    std::vector<int64_t> dimvec;
-    for (int64_t i = 0; i < axis; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    if (keepdim) {
-      dimvec.emplace_back(static_cast<int64_t>(1));
-    }
-    for (int64_t i = axis + 1; i < dim_size; i++) {
-      dimvec.emplace_back(input_dims[i]);
-    }
-    framework::DDim dims = phi::make_ddim(dimvec);
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
-                                                "input shape should >= 1d"));
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor,
+                            PD_INFER_META(phi::ModeInferMeta));
 REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker,
                   ops::ModeGradOpMaker<paddle::framework::OpDesc>,
-                  ops::ModeGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(mode,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, float>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, double>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, int32_t>,
-                       ops::ModeCPUKernel<paddle::platform::CPUPlace, int64_t>);
-
+                  ops::ModeGradOpMaker<paddle::imperative::OpBase>,
+                  ModeInferShapeFunctor);
 REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    mode_grad, ops::ModeGradCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, double>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int32_t>,
-    ops::ModeGradCPUKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu
deleted file mode 100644
index 2bacda8afb0eb..0000000000000
--- a/paddle/fluid/operators/mode_op.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <thrust/device_vector.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/inner_product.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/mode_op.h"
-#include "paddle/fluid/operators/top_k_function_cuda.h"
-
-namespace paddle {
-namespace operators {
-
-int ComputeBlockSize(int col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256 && col <= 512)
-    return 512;
-  else if (col > 128 && col <= 256)
-    return 256;
-  else if (col > 64 && col <= 128)
-    return 128;
-  else
-    return 64;
-}
-
-template <typename T>
-void getModebySort(const platform::CUDADeviceContext& ctx,
-                   const framework::Tensor* input_tensor,
-                   const int64_t num_cols, const int64_t num_rows,
-                   T* out_tensor, int64_t* indices_tensor) {
-  framework::Tensor input_tmp;
-  framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp);
-  T* input_tmp_data = input_tmp.mutable_data<T>(ctx.GetPlace());
-  input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
-  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
-  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
-
-  for (int64_t i = 0; i < num_rows; ++i) {
-    T* begin = input_tmp_data + num_cols * i;
-    T* end = input_tmp_data + num_cols * (i + 1);
-    thrust::device_vector<int64_t> indices_data(num_cols);
-    thrust::sequence(thrust::device, indices_data.begin(),
-                     indices_data.begin() + num_cols);
-    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
-    int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
-                                           begin + 1, 0, thrust::plus<int>(),
-                                           thrust::not_equal_to<T>());
-    thrust::device_vector<T> keys_data(unique);
-    thrust::device_vector<int64_t> cnts_data(unique);
-    thrust::reduce_by_key(thrust::device, begin, end,
-                          thrust::constant_iterator<int>(1), keys_data.begin(),
-                          cnts_data.begin());
-    auto it = thrust::max_element(thrust::device, cnts_data.begin(),
-                                  cnts_data.begin() + unique);
-    T mode = keys_data[it - cnts_data.begin()];
-    int64_t counts = cnts_data[it - cnts_data.begin()];
-    auto pos = thrust::find(thrust::device, begin, end, mode);
-    int64_t index = indices_data[pos - begin + counts - 1];
-    out_tensor_ptr[i] = static_cast<T>(mode);
-    indices_tensor_ptr[i] = static_cast<int64_t>(index);
-  }
-}
-
-template <typename DeviceContext, typename T>
-class ModeOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* input = ctx.Input<framework::Tensor>("X");
-    auto* output = ctx.Output<framework::Tensor>("Out");
-    auto* indices = ctx.Output<framework::Tensor>("Indices");
-    int axis = static_cast<int>(ctx.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(ctx.Attr<bool>("keepdim"));
-
-    // get the input dims
-    const auto& in_dims = input->dims();
-    // calcluate the real axis
-    if (axis < 0) axis += in_dims.size();
-
-    auto out_dims = output->dims();
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      getModebySort<T>(dev_ctx, input, input_width, input_height, output_data,
-                       indices_data);
-    } else {
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dim);
-        indices->Resize(tmp_out_dim);
-      }
-
-      framework::DDim trans_shape(in_dims);
-      framework::DDim trans_out_shape(in_dims);
-      for (int i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = in_dims[trans_axis[i]];
-        trans_out_shape[i] = in_dims[trans_axis[i]];
-      }
-      trans_out_shape[in_dims.size() - 1] = 1;
-
-      // second step, tranpose the input
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_shape, ctx.GetPlace());
-      int ndims = trans_axis.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_input, trans_axis);
-      framework::Tensor trans_ind;
-      int64_t* trans_ind_data =
-          trans_ind.mutable_data<int64_t>(trans_out_shape, ctx.GetPlace());
-      framework::Tensor trans_out;
-      T* trans_out_data =
-          trans_out.mutable_data<T>(trans_out_shape, ctx.GetPlace());
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
-      const int64_t input_width = trans_shape[trans_shape.size() - 1];
-      getModebySort<T>(dev_ctx, &trans_input, input_width, input_height,
-                       trans_out_data, trans_ind_data);
-      // last step, tranpose back the indices and output
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, trans_ind, indices, trans_axis);
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, trans_out,
-                                                   output, trans_axis);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ModeOpGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE_EQ(
-        platform::is_gpu_place(context.GetPlace()), true,
-        platform::errors::InvalidArgument(
-            "It must use CUDAPlace, you must check your device set."));
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = context.Attr<int>("axis");
-
-    const auto& in_dims = x->dims();
-    auto out_dims = indices->dims();
-
-    if (axis < 0) axis += in_dims.size();
-    // allocate the cuda memory for the x_grad
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    const T* out_grad_data = out_grad->data<T>();
-    const int64_t* indices_data = indices->data<int64_t>();
-
-    int pre, n, post;
-    GetDims(in_dims, axis, &pre, &n, &post);
-
-    // calcluate the block and grid num
-    auto& dev_ctx = context.cuda_device_context();
-    int block_size = ComputeBlockSize(post);
-    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
-    const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
-    int grid_size = std::min(max_blocks, pre);
-    AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
-        out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mode, ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ModeOpCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    mode_grad,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ModeOpGradCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h
deleted file mode 100644
index 76d356ed16eb3..0000000000000
--- a/paddle/fluid/operators/mode_op.h
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename Type>
-static void getMode(Type input_height, Type input_width, int input_dim,
-                    const framework::Tensor* input, T* t_out, Type* t_indices) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    std::sort(col_vec.begin(), col_vec.end(),
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-    T mode = 0;
-    int64_t indice = 0;
-    int64_t cur_freq = 0;
-    int64_t max_freq = 0;
-    for (int64_t i = 0; i < input_width; ++i) {
-      ++cur_freq;
-      if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
-        if (cur_freq > max_freq) {
-          max_freq = cur_freq;
-          mode = col_vec[i].first;
-          indice = col_vec[i].second;
-        }
-        cur_freq = 0;
-      }
-    }
-    t_out[i] = mode;
-    t_indices[i] = indice;
-  }
-}
-
-template <typename T, typename Type>
-static void ModeAssign(const Type& input_height, const Type& input_width,
-                       const int& input_dim, const framework::Tensor* input,
-                       const framework::Tensor* indices, T* output_data) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      output_data[i * input_width + e_indices(0)] = e_input(0);
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class ModeCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("X");
-    auto* output = context.Output<framework::Tensor>("Out");
-    auto* indices = context.Output<framework::Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-
-    // axis < 0, cacluate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    auto out_dims = output->dims();
-    // if axis is not the last dim, transpose it to the last dim, do the
-    // calculation,
-    // then tranpose it back to orginal axis.
-    if (axis == in_dims.size() - 1) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      getMode<T, int64_t>(input_height, input_width, in_dims.size(), input,
-                          output_data, indices_data);
-    } else {
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-
-      if (!keepdim) {
-        std::vector<int> tmp_out_shape;
-        for (int i = 0; i < axis; i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        tmp_out_shape.emplace_back(1);
-        for (int i = axis + 1; i < in_dims.size(); i++) {
-          tmp_out_shape.emplace_back(in_dims[i]);
-        }
-        framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
-        output->Resize(tmp_out_dim);
-        indices->Resize(tmp_out_dim);
-      }
-
-      // get the trans input_dims, out_dims
-      framework::DDim trans_shape(in_dims);
-      framework::DDim trans_out_shape(in_dims);
-
-      for (size_t i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = in_dims[trans_axis[i]];
-        trans_out_shape[i] = in_dims[trans_axis[i]];
-      }
-      trans_out_shape[in_dims.size() - 1] = 1;
-
-      framework::Tensor trans_input;
-      trans_input.mutable_data<T>(trans_shape, context.GetPlace());
-      int ndims = trans_axis.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      // transpose the input value
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, *input,
-                                                  &trans_input, trans_axis);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
-      const int64_t input_width = trans_shape[trans_shape.size() - 1];
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_shape, context.GetPlace());
-      framework::Tensor tmp_indices;
-      auto* t_ind = tmp_indices.mutable_data<int64_t>(trans_out_shape,
-                                                      context.GetPlace());
-
-      getMode<T, int64_t>(input_height, input_width, in_dims.size(),
-                          &trans_input, t_out, t_ind);
-      // transpose back
-      TransCompute<platform::CPUDeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans_axis);
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  output, trans_axis);
-      if (!keepdim) {
-        output->Resize(out_dims);
-        indices->Resize(out_dims);
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ModeGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<framework::Tensor>("Indices");
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    bool keepdim = static_cast<bool>(context.Attr<bool>("keepdim"));
-
-    auto in_dims = x->dims();
-    auto out_dims = indices->dims();
-
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    if (!keepdim) {
-      std::vector<int> tmp_out_shape;
-      for (int i = 0; i < axis; i++) {
-        tmp_out_shape.emplace_back(out_dims[i]);
-      }
-      tmp_out_shape.emplace_back(1);
-      for (int i = axis + 1; i < in_dims.size(); i++) {
-        tmp_out_shape.emplace_back(out_dims[i - 1]);
-      }
-      out_dims = phi::make_ddim(tmp_out_shape);
-    }
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis == in_dims.size() - 1) {
-      // allocate the memory for the input_grad
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      // init the output grad with 0, because some input elements has no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // Assign the output_grad to input_grad
-      if (keepdim) {
-        ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices,
-                   x_grad_data);
-      } else {
-        auto& dev_context =
-            context.template device_context<platform::CPUDeviceContext>();
-        framework::Tensor out_grad_tmp;
-        framework::Tensor indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp,
-                   &indices_tmp, x_grad_data);
-      }
-    } else {
-      // can not assign grad to input_grad, must do the transpose
-      std::vector<int> trans_axis;
-      for (int i = 0; i < axis; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans_axis.emplace_back(i);
-      }
-      trans_axis.emplace_back(axis);
-      framework::DDim trans_shape(out_dims);
-      framework::DDim trans_in_shape(in_dims);
-      for (size_t i = 0; i < trans_axis.size(); i++) {
-        trans_shape[i] = out_dims[trans_axis[i]];
-        trans_in_shape[i] = in_dims[trans_axis[i]];
-      }
-      // transpose the out_grad, indices
-      framework::Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_shape, context.GetPlace());
-      framework::Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_shape, context.GetPlace());
-      int ndims = trans_axis.size();
-      auto& dev_context =
-          context.template device_context<platform::CPUDeviceContext>();
-
-      if (keepdim) {
-        // Do transpose
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, *out_grad, &trans_dO, trans_axis);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, *indices, &trans_ind, trans_axis);
-      } else {
-        framework::Tensor out_grad_tmp;
-        framework::Tensor indices_tmp;
-        out_grad_tmp.mutable_data<T>(out_grad->dims(), dev_context.GetPlace());
-        indices_tmp.mutable_data<int64_t>(indices->dims(),
-                                          dev_context.GetPlace());
-        framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context,
-                              &out_grad_tmp);
-        framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context,
-                              &indices_tmp);
-        out_grad_tmp.Resize(out_dims);
-        indices_tmp.Resize(out_dims);
-        // Do transpose
-        TransCompute<platform::CPUDeviceContext, T>(
-            ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis);
-        TransCompute<platform::CPUDeviceContext, int64_t>(
-            ndims, dev_context, indices_tmp, &trans_ind, trans_axis);
-      }
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
-      const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
-
-      // Assign the out_grad to tranpose input_grad
-      framework::Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_shape, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-
-      ModeAssign<T, int64_t>(input_height, input_width, in_dims.size(),
-                             &trans_dO, &trans_ind, t_out);
-
-      // Transpose back
-      TransCompute<platform::CPUDeviceContext, T>(ndims, dev_context, tmp_out,
-                                                  x_grad, trans_axis);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc
index b309e1b87ef90..5b107ce643df3 100644
--- a/paddle/fluid/operators/multi_dot_op.cc
+++ b/paddle/fluid/operators/multi_dot_op.cc
@@ -16,77 +16,19 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 
-/**
- * @brief compute the output shape and check the input shape valid or not
- */
-inline framework::DDim ComputeAndCheckShape(
-    const bool is_runtime, const std::vector<framework::DDim>& inputs_dims) {
-  const size_t n = inputs_dims.size();
-  auto first_dim = inputs_dims[0];
-
-  bool is_vector = false;
-  framework::DDim out_dim;
-
-  PADDLE_ENFORCE_LT(
-      first_dim.size(), static_cast<size_t>(3),
-      platform::errors::InvalidArgument(
-          "multi_dot: the first input tensor must be 1D or 2D but got[%d]!",
-          static_cast<int>(first_dim.size())));
-
-  // If the first tensor is 1D of size n view it as a row vector (1, n)
-  if (first_dim.size() == 1) {
-    first_dim = phi::make_ddim({1, static_cast<int>(first_dim[0])});
-    is_vector = true;
-  }
-
-  auto last_dim = inputs_dims[n - 1];
-  PADDLE_ENFORCE_LT(
-      last_dim.size(), static_cast<size_t>(3),
-      platform::errors::InvalidArgument(
-          "the last input tensor of multi_dot must be 1D or 2D but got[%d]!",
-          static_cast<int>(first_dim.size())));
-
-  // If the last tensor is 1D of size n view it as a column vector (n, 1)
-  if (last_dim.size() == 1) {
-    last_dim = phi::make_ddim({static_cast<int>(last_dim[0]), 1});
-    out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]});
-  } else {
-    out_dim = is_vector ? phi::make_ddim({last_dim[1]})
-                        : phi::make_ddim({first_dim[0], last_dim[1]});
-  }
-
-  auto width = first_dim[1];
-  for (size_t i = 1; i < n - 1; i++) {
-    PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast<size_t>(2),
-                      platform::errors::InvalidArgument(
-                          "the input tensor of multi_dot op must be 2D."));
-
-    const auto& tmp_dim = inputs_dims[i];
-    PADDLE_ENFORCE_EQ(
-        tmp_dim[0], width,
-        platform::errors::InvalidArgument(
-            "the input matrix does not meet the multiplication requirements."));
-    width = tmp_dim[1];
-  }
-
-  PADDLE_ENFORCE_EQ(
-      last_dim[0], width,
-      platform::errors::InvalidArgument(
-          "the input matrix does not meet the multiplication requirements."));
-
-  return out_dim;
-}
-
 class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. If the last argument
 class MultiDotOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot");
-
-    auto inputs_dims = ctx->GetInputsDim("X");
-
-    const size_t inputs_num = inputs_dims.size();
-    PADDLE_ENFORCE_GT(
-        inputs_num, static_cast<size_t>(1),
-        platform::errors::InvalidArgument(
-            "The number of input tensors in multi_dot op should > 1."));
-    auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims);
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", "Out");
-  }
 };
 
 class MultiDotOpGrad : public framework::OperatorWithKernel {
@@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor,
+                            PD_INFER_META(phi::MultiDotInferMeta));
+
 REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker,
                   ops::MultiDotOpGradMaker<paddle::framework::OpDesc>,
-                  ops::MultiDotOpGradMaker<paddle::imperative::OpBase>);
+                  ops::MultiDotOpGradMaker<paddle::imperative::OpBase>,
+                  MultiDotInferShapeFunctor);
+
 REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad,
                   ops::MultiDotOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::MultiDotOpDoubleGradMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index 313a479ea301b..8771a6573cba0 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/multiplex_op.h"
 #include <memory>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
                   ops::MultiplexGradMaker<paddle::framework::OpDesc>,
                   ops::MultiplexGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(
-    multiplex,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    multiplex_grad,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu
deleted file mode 100644
index 0a32ee96fb693..0000000000000
--- a/paddle/fluid/operators/multiplex_op.cu
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/multiplex_op.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename Place, typename T>
-class MultiplexGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto* ids = ctx.Input<Tensor>("Ids");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      PADDLE_ENFORCE_GT(
-          ins[i]->numel(), 0,
-          platform::errors::OutOfRange(
-              "indexing will be out of bounds with size 0 for the %d-th input.",
-              i));
-    }
-
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
-    // copy index to cpu
-    Tensor index_t_cpu;
-    paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
-    auto* index = index_t_cpu.data<int32_t>();
-    auto stream = ctx.cuda_device_context().stream();
-    platform::CUDAPlace place = ctx.GetPlace();
-    for (auto i = 0; i < rows; i++) {
-      int32_t k = index[i];
-      PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
-                                  "index must be nonnegative."));
-      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
-                        platform::errors::PreconditionNotMet(
-                            "index exceeds the number of candidate tensors."));
-      memory::Copy(place, out->data<T>() + i * cols, place,
-                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
-    }
-  }
-};
-
-template <typename Place, typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* ids = ctx.Input<Tensor>("Ids");
-    auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
-
-    size_t idx = -1UL;
-    for (size_t i = 0; i < d_ins.size(); i++) {
-      if (d_ins[i]) {
-        d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(*ctx.template device_context<Place>().eigen_device()) =
-            t.constant(static_cast<T>(0));
-
-        idx = i;
-      }
-    }
-
-    if (idx == -1UL) return;
-
-    auto rows = d_ins[idx]->dims()[0];
-    auto cols = d_ins[idx]->numel() / rows;
-    // copy index to cpu
-    Tensor index_t_cpu;
-    paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
-    auto* index = index_t_cpu.data<int32_t>();
-
-    auto stream = ctx.cuda_device_context().stream();
-    platform::CUDAPlace place = ctx.GetPlace();
-    for (auto i = 0; i < rows; i++) {
-      size_t k = static_cast<size_t>(index[i]);
-      if (d_ins[k]) {
-        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    multiplex,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    multiplex_grad,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h
deleted file mode 100644
index 1d0a009edeedc..0000000000000
--- a/paddle/fluid/operators/multiplex_op.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class MultiplexCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto ids = ctx.Input<framework::Tensor>("Ids");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    for (size_t i = 0; i < ins.size(); ++i) {
-      PADDLE_ENFORCE_GT(
-          ins[i]->numel(), 0,
-          platform::errors::OutOfRange(
-              "indexing will be out of bounds with size 0 for the %d-th input.",
-              i));
-    }
-
-    auto rows = ins[0]->dims()[0];
-    auto cols = ins[0]->numel() / rows;
-    auto index = ids->data<int32_t>();
-    platform::CPUPlace place = ctx.GetPlace();
-    for (auto i = 0; i < rows; i++) {
-      int32_t k = index[i];
-      PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet(
-                                  "index must be nonnegative."));
-      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
-                        platform::errors::PreconditionNotMet(
-                            "index exceeds the number of candidate tensors."));
-      memory::Copy(place, out->data<T>() + i * cols, place,
-                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* ids = ctx.Input<framework::Tensor>("Ids");
-    auto d_ins =
-        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-
-    size_t idx = -1UL;
-    for (size_t i = 0; i < d_ins.size(); i++) {
-      if (d_ins[i]) {
-        d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
-        t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
-            t.constant(static_cast<T>(0));
-
-        idx = i;
-      }
-    }
-
-    if (idx == -1UL) return;
-
-    auto rows = d_ins[idx]->dims()[0];
-    auto cols = d_ins[idx]->numel() / rows;
-    auto* index = ids->data<int32_t>();
-    platform::CPUPlace place = ctx.GetPlace();
-    for (auto i = 0; i < rows; i++) {
-      size_t k = static_cast<size_t>(index[i]);
-      if (d_ins[k]) {
-        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                     d_out->data<T>() + i * cols, cols * sizeof(T));
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 5d394424d54f5..51daccce0e882 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension.
 };
 
 class NormOp : public framework::OperatorWithKernel {
- public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp");
-    auto xdim = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", xdim);
-
-    if (ctx->Attrs().Get<bool>("is_test") == false) {
-      int axis = ctx->Attrs().Get<int>("axis");
-      if (axis < 0) axis = xdim.size() + axis;
-      xdim[axis] = 1;
-      ctx->SetOutputDim("Norm", xdim);
-    }
-  }
 };
 
 class NormOpGrad : public framework::OperatorWithKernel {
@@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> {
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
+DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor,
+                            PD_INFER_META(phi::NormInferMeta));
+
 REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
                   ops::NormOpGradOpMaker<paddle::framework::OpDesc>,
-                  ops::NormOpGradOpMaker<paddle::imperative::OpBase>);
+                  ops::NormOpGradOpMaker<paddle::imperative::OpBase>,
+                  NormInferShapeFunctor);
 REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc
new file mode 100644
index 0000000000000..8f7a3b82acf19
--- /dev/null
+++ b/paddle/fluid/operators/number_count_op.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/number_count_op.h"
+
+namespace paddle {
+namespace operators {
+
+class NumberCountOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx",
+                   "NumberCount");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count",
+                   "NumberCount");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // the dtype of the gate_idx should be same as int64
+    auto gate_idx_dtype =
+        OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx");
+
+    PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64,
+                      platform::errors::InvalidArgument(
+                          "The dtype of the gate_idx_dtype should be int64"));
+    return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace());
+  }
+};
+
+class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("gate_idx", "(Tensor) The input gate index tensor.");
+    AddOutput("Out", "(Tensor) The output expert count tensor.");
+    AddAttr<int>("upper_range", "（int), The number of experts.");
+
+    AddComment(R"DOC(number_count Operator.count gate indices.)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel<int>,
+                       ops::NumberCountOpCPUKernel<int64_t>);
+
+REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp,
+                             ops::NumberCountOpMaker);
diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu
new file mode 100644
index 0000000000000..97e4b4f2845ae
--- /dev/null
+++ b/paddle/fluid/operators/number_count_op.cu
@@ -0,0 +1,108 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/number_count_op.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace operators {
+
+#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1)
+#define PERTHREAD_EXPERTS 256
+#define WARP_SIZE 32
+
+const int CUDA_NUM_THREADS = 512;
+static inline int GET_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void initialize_zero_kernel(T* data, const int length) {
+  CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast<T>(0); }
+}
+
+template <typename T>
+__global__ void NumberCount(const T* gate_idx, T* number_count,
+                            int64_t batch_size, int upper_range) {
+  int res_tmp[PERTHREAD_EXPERTS] = {0};
+  int expert_min = blockIdx.x * PERTHREAD_EXPERTS;
+  int expert_max = expert_min + PERTHREAD_EXPERTS;
+  if (expert_max > upper_range) {
+    expert_max = upper_range;
+  }
+  for (int i = threadIdx.x; i < batch_size; i += blockDim.x) {
+    T idx = gate_idx[i];
+    if (idx == -1) {
+      continue;
+    }
+    if (idx < expert_min || idx >= expert_max) {
+      continue;
+    }
+    res_tmp[idx - expert_min] += 1;
+  }
+  for (int i = expert_min; i < expert_max; ++i) {
+    int x = res_tmp[i - expert_min];
+#pragma unroll
+    for (int j = 1; j < WARP_SIZE; j <<= 1) {
+#ifdef __HIPCC__
+      x = x + __shfl_down(x, j);
+#else
+      x = x + __shfl_down_sync(-1u, x, j);
+#endif
+    }
+    if (threadIdx.x % WARP_SIZE == 0) {
+      platform::CudaAtomicAdd(number_count + i, x);
+    }
+  }
+}
+
+template <typename T>
+class NumberCountOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto gate_idx = context.Input<LoDTensor>("gate_idx");
+    auto upper_range = context.Attr<int>("upper_range");
+    auto number_count = context.Output<LoDTensor>("Out");
+
+    int64_t batch_size = gate_idx->numel();
+    auto place = context.GetPlace();
+    const auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+
+    framework::DDim out_dims = phi::make_ddim({upper_range});
+    auto out_data = number_count->mutable_data<T>(out_dims, place);
+    const T* gate_data = gate_idx->data<T>();
+
+    initialize_zero_kernel<
+        T><<<GET_BLOCKS(upper_range), CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
+        out_data, upper_range);
+
+    NumberCount<
+        T><<<CEIL(upper_range, PERTHREAD_EXPERTS), 256, 0, dev_ctx.stream()>>>(
+        gate_data, out_data, batch_size, upper_range);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel<int64_t>);
diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h
new file mode 100644
index 0000000000000..95e64946fb8a2
--- /dev/null
+++ b/paddle/fluid/operators/number_count_op.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#if defined(PADDLE_WITH_GLOO)
+#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class NumberCountOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support expert count op for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc
index e212f4e7e2b7d..122b6a8a80aac 100644
--- a/paddle/fluid/operators/one_hot_v2_op.cc
+++ b/paddle/fluid/operators/one_hot_v2_op.cc
@@ -12,9 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/one_hot_v2_op.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/backward.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,26 +26,6 @@ namespace operators {
 class OneHotV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "Rank of Input(X) should be at least 1."));
-
-    int depth = ctx->Attrs().Get<int>("depth");
-    if (ctx->HasInput("depth_tensor")) {
-      depth = -1;
-    }
-
-    auto out_dims_vec = phi::vectorize(x_dims);
-    out_dims_vec.push_back(depth);
-    auto out_dims = phi::make_ddim(out_dims_vec);
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel {
   }
 
   framework::OpKernelType GetKernelTypeForVar(
-      const std::string& var_name, const Tensor& tensor,
+      const std::string& var_name, const framework::Tensor& tensor,
       const framework::OpKernelType& expected_kernel_type) const override {
     if (var_name == "depth_tensor") {
       return expected_kernel_type;
@@ -114,10 +98,12 @@ Out is a LoDTensor:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor,
+                            PD_INFER_META(phi::OneHotRawInferMeta));
+
 REGISTER_OPERATOR(
     one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    one_hot_v2, ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::OneHotV2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    OneHotInferShapeFunctor);
diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu
deleted file mode 100644
index 77e2a931e50de..0000000000000
--- a/paddle/fluid/operators/one_hot_v2_op.cu
+++ /dev/null
@@ -1,100 +0,0 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/one_hot_v2_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename InT, typename OutT>
-__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
-                                 const int64_t numel, const int depth) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
-    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
-  }
-}
-
-template <typename DeviceContext, typename InT>
-struct OneHotV2OpCUDAFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
-  const DeviceContext& ctx_;
-  int depth_;
-
-  OneHotV2OpCUDAFunctor(const framework::LoDTensor* in,
-                        framework::LoDTensor* out, int depth,
-                        const DeviceContext& ctx)
-      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
-
-  template <typename OutT>
-  void apply() const {
-    auto* p_in_data = in_->data<InT>();
-    auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    auto stream = ctx_.stream();
-    phi::funcs::set_constant(ctx_, out_, 0.0);
-
-    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
-                           PADDLE_CUDA_NUM_THREADS,
-                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        p_in_data, p_out_data, numel, depth_);
-  }
-};
-
-using LoDTensor = framework::LoDTensor;
-template <typename DeviceContext, typename T>
-class OneHotV2CUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-
-    int depth = -1;
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<framework::Tensor>("depth_tensor");
-      if (platform::is_gpu_place(depth_tensor->place())) {
-        framework::Tensor temp;
-        paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(),
-                                          &temp);
-        depth = *temp.data<int32_t>();
-      } else {
-        depth = *depth_tensor->data<int32_t>();
-      }
-
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    } else {
-      depth = context.Attr<int>("depth");
-    }
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpCUDAFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>()));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    one_hot_v2,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::OneHotV2CUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc
index acf6baf50b418..e5702a37bb2b4 100644
--- a/paddle/fluid/operators/one_hot_v2_op_npu.cc
+++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/one_hot_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T>
 class OneHotV2NPUKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc
index adae2c8f8adaa..0e6f0f8422106 100644
--- a/paddle/fluid/operators/overlap_add_op.cc
+++ b/paddle/fluid/operators/overlap_add_op.cc
@@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel {
     std::vector<int64_t> output_shape;
     int n_frames;
     int frame_length;
+    int seq_length;
 
     int start_axis;
     int end_axis;
@@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel {
       end_axis = x_rank - 3;
     }
 
-    PADDLE_ENFORCE_LE(
-        hop_length, frame_length,
-        platform::errors::InvalidArgument(
-            "Attribute(hop_length) of OverlapAddOp should be less or equal "
-            "than frame_length, but got hop_length(%s) > frame_length(%s).",
-            hop_length, frame_length));
+    bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
+    bool check = ctx->IsRuntime() || !contain_unknown_dim;
+    if (check) {
+      PADDLE_ENFORCE_LE(
+          hop_length, frame_length,
+          platform::errors::InvalidArgument(
+              "Attribute(hop_length) of OverlapAddOp should be less or equal "
+              "than frame_length, but got hop_length(%s) > frame_length(%s).",
+              hop_length, frame_length));
+    }
 
-    const int seq_length = (n_frames - 1) * hop_length + frame_length;
+    if (n_frames == -1) {
+      seq_length = -1;
+    } else {
+      seq_length = (n_frames - 1) * hop_length + frame_length;
+    }
 
     // It won't go into for loop when x_rank == 2U.
     for (int i = start_axis; i <= end_axis; i++) {
diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc
index 7b9a4ab1557bf..e4952a243262b 100644
--- a/paddle/fluid/operators/pad3d_op.cc
+++ b/paddle/fluid/operators/pad3d_op.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -24,734 +26,10 @@ namespace operators {
 
 using framework::Tensor;
 
-template <typename T>
-void ConstPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                         const int in_height, const int in_width,
-                         const int out_depth, const int out_height,
-                         const int out_width, const int pad_front,
-                         const int pad_top, const int pad_left, const int out_d,
-                         const int out_h, const int out_w, const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-       in_h >= in_height || in_w >= in_width)
-          ? value
-          : in_data[in_d * in_height * in_width + in_h * in_width + in_w];
-}
-
-template <typename T>
-void ConstPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
-                         const int in_depth, const int in_height,
-                         const int in_width, const int out_depth,
-                         const int out_height, const int out_width,
-                         const int pad_front, const int pad_top,
-                         const int pad_left, const int out_d, const int out_h,
-                         const int out_w, const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-      in_h >= in_height || in_w >= in_width) {
-    for (int c = 0; c < channels; ++c) {
-      out_data[out_index + c] = value;
-    }
-  } else {
-    const int in_index =
-        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-    for (int c = 0; c < channels; ++c) {
-      out_data[out_index + c] = in_data[in_index + c];
-    }
-  }
-}
-
-template <typename T>
-void ReflectPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                           const int in_height, const int in_width,
-                           const int out_depth, const int out_height,
-                           const int out_width, const int pad_front,
-                           const int pad_top, const int pad_left,
-                           const int out_d, const int out_h, const int out_w,
-                           const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-
-  in_d = std::max(in_d, -in_d);                     // reflect by 0
-  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
-  in_h = std::max(in_h, -in_h);                     // reflect by 0
-  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
-  in_w = std::max(in_w, -in_w);                     // reflect by 0
-  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
-
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
-}
-
-template <typename T>
-void ReflectPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
-                           const int in_depth, const int in_height,
-                           const int in_width, const int out_depth,
-                           const int out_height, const int out_width,
-                           const int pad_front, const int pad_top,
-                           const int pad_left, const int out_d, const int out_h,
-                           const int out_w, const T value) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-
-  in_d = std::max(in_d, -in_d);
-  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
-  in_h = std::max(in_h, -in_h);
-  in_h = std::min(in_h, 2 * in_height - in_h - 2);
-  in_w = std::max(in_w, -in_w);
-  in_w = std::min(in_w, 2 * in_width - in_w - 2);
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    out_data[out_index + c] = in_data[in_index + c];
-  }
-}
-
-template <typename T>
-void ReplicatePad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                             const int in_height, const int in_width,
-                             const int out_depth, const int out_height,
-                             const int out_width, const int pad_front,
-                             const int pad_top, const int pad_left,
-                             const int out_d, const int out_h, const int out_w,
-                             const T value) {
-  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
-  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
-}
-
-template <typename T>
-void ReplicatePad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
-                             const int in_depth, const int in_height,
-                             const int in_width, const int out_depth,
-                             const int out_height, const int out_width,
-                             const int pad_front, const int pad_top,
-                             const int pad_left, const int out_d,
-                             const int out_h, const int out_w, const T value) {
-  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
-  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    out_data[out_index + c] = in_data[in_index + c];
-  }
-}
-
-template <typename T>
-void CircularPad3DFuncNCDHW(const T* in_data, T* out_data, const int in_depth,
-                            const int in_height, const int in_width,
-                            const int out_depth, const int out_height,
-                            const int out_width, const int pad_front,
-                            const int pad_top, const int pad_left,
-                            const int out_d, const int out_h, const int out_w,
-                            const T value) {
-  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
-      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
-}
-
-template <typename T>
-void CircularPad3DFuncNDHWC(const T* in_data, T* out_data, const int channels,
-                            const int in_depth, const int in_height,
-                            const int in_width, const int out_depth,
-                            const int out_height, const int out_width,
-                            const int pad_front, const int pad_top,
-                            const int pad_left, const int out_d,
-                            const int out_h, const int out_w, const T value) {
-  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    out_data[out_index + c] = in_data[in_index + c];
-  }
-}
-
-template <typename T>
-void Pad3DNCDHW(const T* in_data, const int num, const int channels,
-                const int in_depth, const int in_height, const int in_width,
-                const int out_depth, const int out_height, const int out_width,
-                const int pad_front, const int pad_top, const int pad_left,
-                T value, T* out_data,
-                void (*pad_func)(const T*, T*, const int, const int, const int,
-                                 const int, const int, const int, const int,
-                                 const int, const int, const int, const int,
-                                 const int, const T)) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_d = 0; out_d < out_depth; ++out_d) {
-        for (int out_h = 0; out_h < out_height; ++out_h) {
-          for (int out_w = 0; out_w < out_width; ++out_w) {
-            pad_func(in_data, out_data, in_depth, in_height, in_width,
-                     out_depth, out_height, out_width, pad_front, pad_top,
-                     pad_left, out_d, out_h, out_w, value);
-          }
-        }
-      }
-      in_data += in_depth * in_height * in_width;
-      out_data += out_depth * out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad3DNDHWC(const T* in_data, const int num, const int channels,
-                const int in_depth, const int in_height, const int in_width,
-                const int out_depth, const int out_height, const int out_width,
-                const int pad_front, const int pad_top, const int pad_left,
-                T value, T* out_data,
-                void (*pad_func)(const T*, T*, const int, const int, const int,
-                                 const int, const int, const int, const int,
-                                 const int, const int, const int, const int,
-                                 const int, const int, const T)) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_d = 0; out_d < out_depth; ++out_d) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          pad_func(in_data, out_data, channels, in_depth, in_height, in_width,
-                   out_depth, out_height, out_width, pad_front, pad_top,
-                   pad_left, out_d, out_h, out_w, value);
-        }
-      }
-    }
-    in_data += in_depth * in_height * in_width * channels;
-    out_data += out_depth * out_height * out_width * channels;
-  }
-}
-
-template <typename T>
-void ConstPad3DGradNCDHW(T* d_in_data, const T* d_out_data, const int in_depth,
-                         const int in_height, const int in_width,
-                         const int out_depth, const int out_height,
-                         const int out_width, const int pad_front,
-                         const int pad_top, const int pad_left, const int out_d,
-                         const int out_h, const int out_w) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-        in_h >= in_height || in_w >= in_width)) {
-    d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] =
-        d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
-  }
-}
-
-template <typename T>
-void ConstPad3DGradNDHWC(T* d_in_data, const T* d_out_data, const int channels,
-                         const int in_depth, const int in_height,
-                         const int in_width, const int out_depth,
-                         const int out_height, const int out_width,
-                         const int pad_front, const int pad_top,
-                         const int pad_left, const int out_d, const int out_h,
-                         const int out_w) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-        in_h >= in_height || in_w >= in_width)) {
-    const int in_index =
-        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-    for (int c = 0; c < channels; ++c) {
-      d_in_data[in_index + c] = d_out_data[out_index + c];
-    }
-  }
-}
-
-template <typename T>
-void ReflectPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
-                           const int in_depth, const int in_height,
-                           const int in_width, const int out_depth,
-                           const int out_height, const int out_width,
-                           const int pad_front, const int pad_top,
-                           const int pad_left, const int out_d, const int out_h,
-                           const int out_w) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-
-  in_d = std::max(in_d, -in_d);                     // reflect by 0
-  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
-  in_h = std::max(in_h, -in_h);                     // reflect by 0
-  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
-  in_w = std::max(in_w, -in_w);                     // reflect by 0
-  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
-
-  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
-      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
-}
-
-template <typename T>
-void ReflectPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
-                           const int channels, const int in_depth,
-                           const int in_height, const int in_width,
-                           const int out_depth, const int out_height,
-                           const int out_width, const int pad_front,
-                           const int pad_top, const int pad_left,
-                           const int out_d, const int out_h, const int out_w) {
-  int in_d = out_d - pad_front;
-  int in_h = out_h - pad_top;
-  int in_w = out_w - pad_left;
-
-  in_d = std::max(in_d, -in_d);
-  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
-  in_h = std::max(in_h, -in_h);
-  in_h = std::min(in_h, 2 * in_height - in_h - 2);
-  in_w = std::max(in_w, -in_w);
-  in_w = std::min(in_w, 2 * in_width - in_w - 2);
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    d_in_data[in_index + c] += d_out_data[out_index + c];
-  }
-}
-
-template <typename T>
-void ReplicatePad3DGradNCDHW(T* d_in_data, const T* d_out_data,
-                             const int in_depth, const int in_height,
-                             const int in_width, const int out_depth,
-                             const int out_height, const int out_width,
-                             const int pad_front, const int pad_top,
-                             const int pad_left, const int out_d,
-                             const int out_h, const int out_w) {
-  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
-  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-
-  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
-      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
-}
-
-template <typename T>
-void ReplicatePad3DGradNDHWC(T* d_in_data, const T* d_out_data,
-                             const int channels, const int in_depth,
-                             const int in_height, const int in_width,
-                             const int out_depth, const int out_height,
-                             const int out_width, const int pad_front,
-                             const int pad_top, const int pad_left,
-                             const int out_d, const int out_h,
-                             const int out_w) {
-  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
-  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
-  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    d_in_data[in_index + c] += d_out_data[out_index + c];
-  }
-}
-
-template <typename T>
-void CircularPad3DGradNCDHW(T* d_in_data, const T* d_out_data,
-                            const int in_depth, const int in_height,
-                            const int in_width, const int out_depth,
-                            const int out_height, const int out_width,
-                            const int pad_front, const int pad_top,
-                            const int pad_left, const int out_d,
-                            const int out_h, const int out_w) {
-  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
-      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
-}
-
-template <typename T>
-void CircularPad3DGradNDHWC(T* d_in_data, const T* d_out_data,
-                            const int channels, const int in_depth,
-                            const int in_height, const int in_width,
-                            const int out_depth, const int out_height,
-                            const int out_width, const int pad_front,
-                            const int pad_top, const int pad_left,
-                            const int out_d, const int out_h, const int out_w) {
-  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-  const int out_index =
-      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
-  const int in_index =
-      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
-  for (int c = 0; c < channels; ++c) {
-    d_in_data[in_index + c] += d_out_data[out_index + c];
-  }
-}
-
-template <typename T>
-void Pad3DGradNCDHW(T* d_in_data, const int num, const int channels,
-                    const int in_depth, const int in_height, const int in_width,
-                    const int out_depth, const int out_height,
-                    const int out_width, const int pad_front, const int pad_top,
-                    const int pad_left, const T* d_out_data,
-                    void (*pad_func)(T*, const T*, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int)) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int out_d = 0; out_d < out_depth; ++out_d) {
-        for (int out_h = 0; out_h < out_height; ++out_h) {
-          for (int out_w = 0; out_w < out_width; ++out_w) {
-            pad_func(d_in_data, d_out_data, in_depth, in_height, in_width,
-                     out_depth, out_height, out_width, pad_front, pad_top,
-                     pad_left, out_d, out_h, out_w);
-          }
-        }
-      }
-      d_in_data += in_depth * in_height * in_width;
-      d_out_data += out_depth * out_height * out_width;
-    }
-  }
-}
-
-template <typename T>
-void Pad3DGradNDHWC(T* d_in_data, const int num, const int channels,
-                    const int in_depth, const int in_height, const int in_width,
-                    const int out_depth, const int out_height,
-                    const int out_width, const int pad_front, const int pad_top,
-                    const int pad_left, const T* d_out_data,
-                    void (*pad_func)(T*, const T*, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int, const int)) {
-  for (int n = 0; n < num; ++n) {
-    for (int out_d = 0; out_d < out_depth; ++out_d) {
-      for (int out_h = 0; out_h < out_height; ++out_h) {
-        for (int out_w = 0; out_w < out_width; ++out_w) {
-          pad_func(d_in_data, d_out_data, channels, in_depth, in_height,
-                   in_width, out_depth, out_height, out_width, pad_front,
-                   pad_top, pad_left, out_d, out_h, out_w);
-        }
-      }
-    }
-    d_in_data += in_depth * in_height * in_width * channels;
-    d_out_data += out_depth * out_height * out_width * channels;
-  }
-}
-
-static inline std::vector<int> GetPaddings(
-    const framework::ExecutionContext& context) {
-  std::vector<int> paddings(6);
-  auto* paddings_t = context.Input<Tensor>("Paddings");
-  if (paddings_t) {
-    auto paddings_data = paddings_t->data<int>();
-    std::memcpy(paddings.data(), paddings_data, paddings.size() * sizeof(int));
-  } else {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    std::copy(pads.begin(), pads.end(), paddings.data());
-  }
-  return paddings;
-}
-
-template <typename T>
-class Pad3dCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::vector<int> pads = GetPaddings(context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    T value = static_cast<T>(context.Attr<float>("value"));
-
-    auto* x = context.Input<Tensor>("X");
-    auto in_dims = x->dims();
-    const T* in_data = x->data<T>();
-
-    auto* out = context.Output<Tensor>("Out");
-    if (data_format == "NCDHW") {
-      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5],
-                   in_dims[3] + pads[2] + pads[3],
-                   in_dims[4] + pads[0] + pads[1]});
-    } else {
-      out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5],
-                   in_dims[2] + pads[2] + pads[3],
-                   in_dims[3] + pads[0] + pads[1], in_dims[4]});
-    }
-    auto out_dims = out->dims();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-
-    int channels = in_dims[1];
-    int in_depth = in_dims[2];
-    int in_height = in_dims[3];
-    int in_width = in_dims[4];
-    int out_depth = out_dims[2];
-    int out_height = out_dims[3];
-    int out_width = out_dims[4];
-    if (data_format == "NDHWC") {
-      channels = in_dims[4];
-      in_depth = in_dims[1];
-      in_height = in_dims[2];
-      in_width = in_dims[3];
-      out_depth = out_dims[1];
-      out_height = out_dims[2];
-      out_width = out_dims[3];
-    }
-
-    if (mode == "reflect") {
-      PADDLE_ENFORCE_GT(in_depth, pads[4],
-                        platform::errors::InvalidArgument(
-                            "The depth of Input(X)'s dimension should be "
-                            "greater than pad_front"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_front(%d).",
-                            in_depth, pads[4]));
-      PADDLE_ENFORCE_GT(in_depth, pads[5],
-                        platform::errors::InvalidArgument(
-                            "The depth of Input(X)'s dimension should be "
-                            "greater than pad_back"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_back(%d).",
-                            in_depth, pads[5]));
-
-      PADDLE_ENFORCE_GT(in_height, pads[2],
-                        platform::errors::InvalidArgument(
-                            "The height of Input(X)'s dimension should be "
-                            "greater than pad_top"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_top(%d).",
-                            in_height, pads[2]));
-      PADDLE_ENFORCE_GT(in_height, pads[3],
-                        platform::errors::InvalidArgument(
-                            "The height of Input(X)'s dimension should be "
-                            "greater than pad_bottom"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_bottom(%d).",
-                            in_height, pads[3]));
-
-      PADDLE_ENFORCE_GT(in_width, pads[0],
-                        platform::errors::InvalidArgument(
-                            "The width of Input(X)'s dimension should be "
-                            "greater than pad_left"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_left(%d).",
-                            in_width, pads[0]));
-      PADDLE_ENFORCE_GT(in_width, pads[1],
-                        platform::errors::InvalidArgument(
-                            "The width of Input(X)'s dimension should be "
-                            "greater than pad_right"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_right(%d).",
-                            in_width, pads[1]));
-    } else if (mode == "circular" || mode == "replicate") {
-      PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0,
-                        platform::errors::InvalidArgument(
-                            "The input tensor size can not be 0 for circular "
-                            "or replicate padding mode."));
-    }
-
-    const int pad_left = pads[0];
-    const int pad_top = pads[2];
-    const int pad_front = pads[4];
-    const int num = in_dims[0];
-    if (data_format == "NCDHW") {
-      std::map<std::string,
-               void (*)(const T*, T*, const int, const int, const int,
-                        const int, const int, const int, const int, const int,
-                        const int, const int, const int, const int, const T)>
-          func_map;
-
-      func_map["reflect"] = ReflectPad3DFuncNCDHW;
-      func_map["replicate"] = ReplicatePad3DFuncNCDHW;
-      func_map["circular"] = CircularPad3DFuncNCDHW;
-      func_map["constant"] = ConstPad3DFuncNCDHW;
-      Pad3DNCDHW(in_data, num, channels, in_depth, in_height, in_width,
-                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-                 value, out_data, func_map[mode]);
-    } else {
-      std::map<std::string, void (*)(const T*, T*, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int, const int, const int,
-                                     const int, const int, const int, const T)>
-          func_map;
-
-      func_map["reflect"] = ReflectPad3DFuncNDHWC;
-      func_map["replicate"] = ReplicatePad3DFuncNDHWC;
-      func_map["circular"] = CircularPad3DFuncNDHWC;
-      func_map["constant"] = ConstPad3DFuncNDHWC;
-      Pad3DNDHWC(in_data, num, channels, in_depth, in_height, in_width,
-                 out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-                 value, out_data, func_map[mode]);
-    }
-  }
-};
-
-template <typename T>
-class Pad3dGradCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::vector<int> pads = GetPaddings(context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
-    auto d_in_dims = d_in->dims();
-    auto d_out_dims = d_out->dims();
-    const T* d_out_data = d_out->data<T>();
-    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
-    phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_zero;
-    set_zero(context.template device_context<platform::CPUDeviceContext>(),
-             d_in, static_cast<T>(0));
-    const int pad_left = pads[0];
-    const int pad_top = pads[2];
-    const int pad_front = pads[4];
-    const int num = d_in_dims[0];
-    if (data_format == "NCDHW") {
-      const int channels = d_in_dims[1];
-      const int in_depth = d_in_dims[2];
-      const int in_height = d_in_dims[3];
-      const int in_width = d_in_dims[4];
-      const int out_depth = d_out_dims[2];
-      const int out_height = d_out_dims[3];
-      const int out_width = d_out_dims[4];
-
-      std::map<std::string,
-               void (*)(T*, const T*, const int, const int, const int,
-                        const int, const int, const int, const int, const int,
-                        const int, const int, const int, const int)>
-          func_map;
-
-      func_map["reflect"] = ReflectPad3DGradNCDHW;
-      func_map["replicate"] = ReplicatePad3DGradNCDHW;
-      func_map["circular"] = CircularPad3DGradNCDHW;
-      func_map["constant"] = ConstPad3DGradNCDHW;
-
-      Pad3DGradNCDHW(d_in_data, num, channels, in_depth, in_height, in_width,
-                     out_depth, out_height, out_width, pad_front, pad_top,
-                     pad_left, d_out_data, func_map[mode]);
-    } else {
-      const int channels = d_in_dims[4];
-      const int in_depth = d_in_dims[1];
-      const int in_height = d_in_dims[2];
-      const int in_width = d_in_dims[3];
-      const int out_depth = d_out_dims[1];
-      const int out_height = d_out_dims[2];
-      const int out_width = d_out_dims[3];
-
-      std::map<std::string,
-               void (*)(T*, const T*, const int, const int, const int,
-                        const int, const int, const int, const int, const int,
-                        const int, const int, const int, const int, const int)>
-          func_map;
-
-      func_map["reflect"] = ReflectPad3DGradNDHWC;
-      func_map["replicate"] = ReplicatePad3DGradNDHWC;
-      func_map["circular"] = CircularPad3DGradNDHWC;
-      func_map["constant"] = ConstPad3DGradNDHWC;
-
-      Pad3DGradNDHWC(d_in_data, num, channels, in_depth, in_height, in_width,
-                     out_depth, out_height, out_width, pad_front, pad_top,
-                     pad_left, d_out_data, func_map[mode]);
-    }
-  }
-};
-
 class Pad3dOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad3d");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad3d");
-
-    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(x_dim.size(), 5,
-                      platform::errors::InvalidArgument(
-                          "The size of Input(X)'s dimension should be equal to "
-                          "5, but received %d. ",
-                          x_dim.size()));
-
-    std::vector<int64_t> out_dims(x_dim.size());
-    auto data_format = ctx->Attrs().Get<std::string>("data_format");
-    out_dims[0] = x_dim[0];
-    if (ctx->HasInput("Paddings")) {
-      auto paddings_dim = ctx->GetInputDim("Paddings");
-      PADDLE_ENFORCE_EQ(paddings_dim.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "Size of Input(Paddings)'s dimension should be "
-                            "equal to 1, but received %d.",
-                            paddings_dim.size()));
-      if (ctx->IsRuntime()) {
-        PADDLE_ENFORCE_EQ(paddings_dim[0], 6,
-                          platform::errors::InvalidArgument(
-                              "Shape of Input(Paddings) should be equal to "
-                              "[6], but received [%d].",
-                              paddings_dim[0]));
-      }
-      out_dims[1] = x_dim[1];
-      out_dims[2] = x_dim[2];
-      out_dims[3] = x_dim[3];
-    } else {
-      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-      PADDLE_ENFORCE_EQ(
-          paddings.size(), 6,
-          platform::errors::InvalidArgument(
-              "Size of paddings should be equal to 4, but received %d.",
-              static_cast<int>(paddings.size())));
-      if (data_format == "NCDHW") {
-        out_dims[1] = x_dim[1];  // channel
-        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
-                          ? x_dim[2]
-                          : (x_dim[2] + paddings[4] + paddings[5]);  // depth
-
-        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
-                          ? x_dim[3]
-                          : (x_dim[3] + paddings[2] + paddings[3]);  // height
-
-        out_dims[4] = ((!ctx->IsRuntime()) && (x_dim[4] < 0))
-                          ? x_dim[4]
-                          : (x_dim[4] + paddings[0] + paddings[1]);  // width
-      } else {                                                       // NDHWC
-        out_dims[4] = x_dim[4];                                      // channel
-
-        out_dims[1] = ((!ctx->IsRuntime()) && (x_dim[1] < 0))
-                          ? x_dim[1]
-                          : (x_dim[1] + paddings[4] + paddings[5]);  // depth
-        out_dims[2] = ((!ctx->IsRuntime()) && (x_dim[2] < 0))
-                          ? x_dim[2]
-                          : (x_dim[2] + paddings[2] + paddings[3]);  // height
-        out_dims[3] = ((!ctx->IsRuntime()) && (x_dim[3] < 0))
-                          ? x_dim[3]
-                          : (x_dim[3] + paddings[0] + paddings[1]);  // width
-      }
-    }
-
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -921,15 +199,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad3dOpGradNoNeedBufferVarsInferer, "X");
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(pad3d, Pad3dInferShapeFunctor,
+                            PD_INFER_META(phi::Pad3dInferMeta));
+
 REGISTER_OPERATOR(pad3d, ops::Pad3dOp, ops::Pad3dOpMaker,
                   ops::Pad3dOpGradMaker<paddle::framework::OpDesc>,
-                  ops::Pad3dOpGradMaker<paddle::imperative::OpBase>);
+                  ops::Pad3dOpGradMaker<paddle::imperative::OpBase>,
+                  Pad3dInferShapeFunctor);
 REGISTER_OPERATOR(pad3d_grad, ops::Pad3dOpGrad,
                   ops::Pad3dOpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::Pad3dOpDoubleGradMaker<paddle::imperative::OpBase>,
                   ops::Pad3dOpGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(pad3d, ops::Pad3dCPUKernel<float>,
-                       ops::Pad3dCPUKernel<double>, ops::Pad3dCPUKernel<int>,
-                       ops::Pad3dCPUKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL(pad3d_grad, ops::Pad3dGradCPUKernel<float>,
-                       ops::Pad3dGradCPUKernel<double>);
diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu
deleted file mode 100644
index 9ab0eb9d445da..0000000000000
--- a/paddle/fluid/operators/pad3d_op.cu
+++ /dev/null
@@ -1,793 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-using framework::Tensor;
-
-template <typename T>
-__global__ void Pad3DConstNCDHW(const int nthreads, const T* in_data,
-                                const int num, const int channels,
-                                const int in_depth, const int in_height,
-                                const int in_width, const int out_depth,
-                                const int out_height, const int out_width,
-                                const int pad_front, const int pad_top,
-                                const int pad_left, T value, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = out_d - pad_front;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-    out_data[index] =
-        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-         in_h >= in_height || in_w >= in_width)
-            ? value
-            : in_data[nc * in_depth * in_height * in_width +
-                      in_d * in_height * in_width + in_h * in_width + in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DConstNDHWC(const int nthreads, const T* in_data,
-                                const int num, const int channels,
-                                const int in_depth, const int in_height,
-                                const int in_width, const int out_depth,
-                                const int out_height, const int out_width,
-                                const int pad_front, const int pad_top,
-                                const int pad_left, T value, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-    const int in_d = out_d - pad_front;
-    const int in_h = out_h - pad_top;
-    const int in_w = out_w - pad_left;
-
-    out_data[index] =
-        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
-         in_h >= in_height || in_w >= in_width)
-            ? value
-            : in_data[n * in_depth * in_height * in_width * channels +
-                      in_d * in_height * in_width * channels +
-                      in_h * in_width * channels + in_w * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DReflectNCDHW(const int nthreads, const T* in_data,
-                                  const int num, const int channels,
-                                  const int in_depth, const int in_height,
-                                  const int in_width, const int out_depth,
-                                  const int out_height, const int out_width,
-                                  const int pad_front, const int pad_top,
-                                  const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = out_d - pad_front;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-
-    in_d = max(in_d, -in_d);                     // reflect by 0
-    in_d = min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
-    in_h = max(in_h, -in_h);                     // reflect by 0
-    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
-    in_w = max(in_w, -in_w);                     // reflect by 0
-    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
-    out_data[index] =
-        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
-                    in_width +
-                in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DReflectNDHWC(const int nthreads, const T* in_data,
-                                  const int num, const int channels,
-                                  const int in_depth, const int in_height,
-                                  const int in_width, const int out_depth,
-                                  const int out_height, const int out_width,
-                                  const int pad_front, const int pad_top,
-                                  const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-    int in_d = out_d - pad_front;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-
-    in_d = max(in_d, -in_d);
-    in_d = min(in_d, 2 * in_depth - in_d - 2);
-    in_h = max(in_h, -in_h);
-    in_h = min(in_h, 2 * in_height - in_h - 2);
-    in_w = max(in_w, -in_w);
-    in_w = min(in_w, 2 * in_width - in_w - 2);
-
-    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
-                              in_d * in_height * in_width * channels +
-                              in_h * in_width * channels + in_w * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DReplicateNCDHW(const int nthreads, const T* in_data,
-                                    const int num, const int channels,
-                                    const int in_depth, const int in_height,
-                                    const int in_width, const int out_depth,
-                                    const int out_height, const int out_width,
-                                    const int pad_front, const int pad_top,
-                                    const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
-    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-
-    out_data[index] =
-        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
-                    in_width +
-                in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DReplicateNDHWC(const int nthreads, const T* in_data,
-                                    const int num, const int channels,
-                                    const int in_depth, const int in_height,
-                                    const int in_width, const int out_depth,
-                                    const int out_height, const int out_width,
-                                    const int pad_front, const int pad_top,
-                                    const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-
-    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
-    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-
-    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
-                              in_d * in_height * in_width * channels +
-                              in_h * in_width * channels + in_w * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DCircularNCDHW(const int nthreads, const T* in_data,
-                                   const int num, const int channels,
-                                   const int in_depth, const int in_height,
-                                   const int in_width, const int out_depth,
-                                   const int out_height, const int out_width,
-                                   const int pad_front, const int pad_top,
-                                   const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int nc = index / out_width;
-
-    const int out_w = index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-    out_data[index] =
-        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
-                    in_width +
-                in_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DCircularNDHWC(const int nthreads, const T* in_data,
-                                   const int num, const int channels,
-                                   const int in_depth, const int in_height,
-                                   const int in_width, const int out_depth,
-                                   const int out_height, const int out_width,
-                                   const int pad_front, const int pad_top,
-                                   const int pad_left, T* out_data) {
-  CUDA_KERNEL_LOOP(index, nthreads) {
-    int n = index / channels;
-    const int c = index % channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-
-    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
-                              in_d * in_height * in_width * channels +
-                              in_h * in_width * channels + in_w * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradConstNCDHW(const int in_size, T* d_in_data,
-                                    const int num, const int channels,
-                                    const int in_depth, const int in_height,
-                                    const int in_width, const int out_depth,
-                                    const int out_height, const int out_width,
-                                    const int pad_front, const int pad_top,
-                                    const int pad_left, const T* d_out_data) {
-  CUDA_KERNEL_LOOP(in_index, in_size) {
-    const int in_w = in_index % in_width;
-
-    int nc = in_index / in_width;
-    const int in_h = nc % in_height;
-
-    nc /= in_height;
-    const int in_d = nc % in_depth;
-
-    nc /= in_depth;
-
-    const int out_d = in_d + pad_front;
-    const int out_h = in_h + pad_top;
-    const int out_w = in_w + pad_left;
-    d_in_data[in_index] =
-        d_out_data[nc * out_depth * out_height * out_width +
-                   out_d * out_height * out_width + out_h * out_width + out_w];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradConstNDHWC(const int in_size, T* d_in_data,
-                                    const int num, const int channels,
-                                    const int in_depth, const int in_height,
-                                    const int in_width, const int out_depth,
-                                    const int out_height, const int out_width,
-                                    const int pad_front, const int pad_top,
-                                    const int pad_left, const T* d_out_data) {
-  CUDA_KERNEL_LOOP(in_index, in_size) {
-    const int c = in_index % channels;
-    int n = in_index / channels;
-
-    const int in_w = n % in_width;
-    n /= in_width;
-
-    const int in_h = n % in_height;
-    n /= in_height;
-
-    const int in_d = n % in_depth;
-    n /= in_depth;
-
-    const int out_d = in_d + pad_front;
-    const int out_h = in_h + pad_top;
-    const int out_w = in_w + pad_left;
-
-    d_in_data[in_index] =
-        d_out_data[n * out_depth * out_height * out_width * channels +
-                   out_d * out_height * out_width * channels +
-                   out_h * out_width * channels + out_w * channels + c];
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradReflectNCDHW(const int out_size, T* d_in_data,
-                                      const int num, const int channels,
-                                      const int in_depth, const int in_height,
-                                      const int in_width, const int out_depth,
-                                      const int out_height, const int out_width,
-                                      const int pad_front, const int pad_top,
-                                      const int pad_left, const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    int nc = out_index / out_width;
-    const int out_w = out_index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = out_d - pad_front;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-
-    in_d = max(in_d, -in_d);
-    in_h = max(in_h, -in_h);
-    in_w = max(in_w, -in_w);
-
-    in_d = min(in_d, 2 * in_depth - in_d - 2);
-    in_h = min(in_h, 2 * in_height - in_h - 2);
-    in_w = min(in_w, 2 * in_width - in_w - 2);
-
-    platform::CudaAtomicAdd(
-        &d_in_data[nc * in_depth * in_height * in_width +
-                   in_d * in_height * in_width + in_h * in_width + in_w],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradReflectNDHWC(const int out_size, T* d_in_data,
-                                      const int num, const int channels,
-                                      const int in_depth, const int in_height,
-                                      const int in_width, const int out_depth,
-                                      const int out_height, const int out_width,
-                                      const int pad_front, const int pad_top,
-                                      const int pad_left, const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    const int c = out_index % channels;
-    int n = out_index / channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-
-    int in_d = out_d - pad_front;
-    int in_h = out_h - pad_top;
-    int in_w = out_w - pad_left;
-
-    in_d = max(in_d, -in_d);
-    in_h = max(in_h, -in_h);
-    in_w = max(in_w, -in_w);
-
-    in_d = min(in_d, in_depth * 2 - in_d - 2);
-    in_h = min(in_h, in_height * 2 - in_h - 2);
-    in_w = min(in_w, in_width * 2 - in_w - 2);
-    platform::CudaAtomicAdd(
-        &d_in_data[n * in_depth * in_height * in_width * channels +
-                   in_d * in_height * in_width * channels +
-                   in_h * in_width * channels + in_w * channels + c],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradReplicateNCDHW(
-    const int out_size, T* d_in_data, const int num, const int channels,
-    const int in_depth, const int in_height, const int in_width,
-    const int out_depth, const int out_height, const int out_width,
-    const int pad_front, const int pad_top, const int pad_left,
-    const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    int nc = out_index / out_width;
-    const int out_w = out_index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
-    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-
-    platform::CudaAtomicAdd(
-        &d_in_data[nc * in_depth * in_height * in_width +
-                   in_d * in_height * in_width + in_h * in_width + in_w],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradReplicateNDHWC(
-    const int out_size, T* d_in_data, const int num, const int channels,
-    const int in_depth, const int in_height, const int in_width,
-    const int out_depth, const int out_height, const int out_width,
-    const int pad_front, const int pad_top, const int pad_left,
-    const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    const int c = out_index % channels;
-    int n = out_index / channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-
-    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
-    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
-    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
-
-    platform::CudaAtomicAdd(
-        &d_in_data[n * in_depth * in_height * in_width * channels +
-                   in_d * in_height * in_width * channels +
-                   in_h * in_width * channels + in_w * channels + c],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradCircularNCDHW(const int out_size, T* d_in_data,
-                                       const int num, const int channels,
-                                       const int in_depth, const int in_height,
-                                       const int in_width, const int out_depth,
-                                       const int out_height,
-                                       const int out_width, const int pad_front,
-                                       const int pad_top, const int pad_left,
-                                       const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    int nc = out_index / out_width;
-    const int out_w = out_index % out_width;
-    const int out_h = nc % out_height;
-    nc /= out_height;
-    const int out_d = nc % out_depth;
-    nc /= out_depth;
-
-    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-    platform::CudaAtomicAdd(
-        &d_in_data[nc * in_depth * in_height * in_width +
-                   in_d * in_height * in_width + in_h * in_width + in_w],
-        d_out_data[out_index]);
-  }
-}
-
-template <typename T>
-__global__ void Pad3DGradCircularNDHWC(const int out_size, T* d_in_data,
-                                       const int num, const int channels,
-                                       const int in_depth, const int in_height,
-                                       const int in_width, const int out_depth,
-                                       const int out_height,
-                                       const int out_width, const int pad_front,
-                                       const int pad_top, const int pad_left,
-                                       const T* d_out_data) {
-  CUDA_KERNEL_LOOP(out_index, out_size) {
-    const int c = out_index % channels;
-    int n = out_index / channels;
-    const int out_w = n % out_width;
-    n /= out_width;
-    const int out_h = n % out_height;
-    n /= out_height;
-    const int out_d = n % out_depth;
-    n /= out_depth;
-
-    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
-    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
-    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
-
-    platform::CudaAtomicAdd(
-        &d_in_data[n * in_depth * in_height * in_width * channels +
-                   in_d * in_height * in_width * channels +
-                   in_h * in_width * channels + in_w * channels + c],
-        d_out_data[out_index]);
-  }
-}
-
-static inline std::vector<int> GetPaddings(
-    const framework::ExecutionContext& context) {
-  std::vector<int> paddings(6);
-  auto* paddings_data = context.Input<Tensor>("Paddings");
-  if (paddings_data) {
-    Tensor pads;
-    framework::TensorCopySync(*paddings_data, platform::CPUPlace(), &pads);
-    auto pads_data = pads.data<int>();
-    std::memcpy(paddings.data(), pads_data, paddings.size() * sizeof(int));
-  } else {
-    auto pads = context.Attr<std::vector<int>>("paddings");
-    std::copy(pads.begin(), pads.end(), paddings.data());
-  }
-  return paddings;
-}
-
-template <typename T>
-class Pad3dCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::vector<int> pads = GetPaddings(context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    T value = static_cast<T>(context.Attr<float>("value"));
-
-    auto* x = context.Input<Tensor>("X");
-    auto in_dims = x->dims();
-    const T* in_data = x->data<T>();
-    auto* out = context.Output<Tensor>("Out");
-    auto out_dims = out->dims();
-    if (data_format == "NCDHW") {
-      out_dims[0] = in_dims[0];
-      out_dims[1] = in_dims[1];
-      out_dims[2] = in_dims[2] + pads[4] + pads[5];
-      out_dims[3] = in_dims[3] + pads[2] + pads[3];
-      out_dims[4] = in_dims[4] + pads[0] + pads[1];
-    } else {
-      out_dims[0] = in_dims[0];
-      out_dims[1] = in_dims[1] + pads[4] + pads[5];
-      out_dims[2] = in_dims[2] + pads[2] + pads[3];
-      out_dims[3] = in_dims[3] + pads[0] + pads[1];
-      out_dims[4] = in_dims[4];
-    }
-    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
-
-    int channels = in_dims[1];
-    int in_depth = in_dims[2];
-    int in_height = in_dims[3];
-    int in_width = in_dims[4];
-    int out_depth = out_dims[2];
-    int out_height = out_dims[3];
-    int out_width = out_dims[4];
-    if (data_format == "NDHWC") {
-      channels = in_dims[4];
-      in_depth = in_dims[1];
-      in_height = in_dims[2];
-      in_width = in_dims[3];
-      out_depth = out_dims[1];
-      out_height = out_dims[2];
-      out_width = out_dims[3];
-    }
-
-    if (mode == "reflect") {
-      PADDLE_ENFORCE_GT(in_depth, pads[4],
-                        platform::errors::InvalidArgument(
-                            "The depth of Input(X)'s dimension should be "
-                            "greater than pad_front"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_front(%d).",
-                            in_depth, pads[4]));
-      PADDLE_ENFORCE_GT(in_depth, pads[5],
-                        platform::errors::InvalidArgument(
-                            "The depth of Input(X)'s dimension should be "
-                            "greater than pad_back"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_back(%d).",
-                            in_depth, pads[5]));
-
-      PADDLE_ENFORCE_GT(in_height, pads[2],
-                        platform::errors::InvalidArgument(
-                            "The height of Input(X)'s dimension should be "
-                            "greater than pad_top"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_top(%d).",
-                            in_height, pads[2]));
-      PADDLE_ENFORCE_GT(in_height, pads[3],
-                        platform::errors::InvalidArgument(
-                            "The height of Input(X)'s dimension should be "
-                            "greater than pad_bottom"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_bottom(%d).",
-                            in_height, pads[3]));
-
-      PADDLE_ENFORCE_GT(in_width, pads[0],
-                        platform::errors::InvalidArgument(
-                            "The width of Input(X)'s dimension should be "
-                            "greater than pad_left"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_left(%d).",
-                            in_width, pads[0]));
-      PADDLE_ENFORCE_GT(in_width, pads[1],
-                        platform::errors::InvalidArgument(
-                            "The width of Input(X)'s dimension should be "
-                            "greater than pad_right"
-                            " in reflect mode"
-                            ", but received depth(%d) and pad_right(%d).",
-                            in_width, pads[1]));
-    } else if (mode == "circular" || mode == "replicate") {
-      PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0,
-                        platform::errors::InvalidArgument(
-                            "The input tensor size can not be 0 for circular "
-                            "or replicate padding mode."));
-    }
-
-    const int pad_left = pads[0];
-    const int pad_top = pads[2];
-    const int pad_front = pads[4];
-    const int num = in_dims[0];
-
-    auto stream = context.cuda_device_context().stream();
-    int block = PADDLE_CUDA_NUM_THREADS;
-    const int out_size = out->numel();
-    int grid = (out_size + block - 1) / block;
-
-    if (data_format == "NCDHW") {
-      if (mode == "reflect") {
-        Pad3DReflectNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else if (mode == "replicate") {
-        Pad3DReplicateNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else if (mode == "circular") {
-        Pad3DCircularNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else {
-        Pad3DConstNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            value, out_data);
-      }
-    } else {
-      if (mode == "reflect") {
-        Pad3DReflectNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else if (mode == "replicate") {
-        Pad3DReplicateNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else if (mode == "circular") {
-        Pad3DCircularNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            out_data);
-      } else {
-        Pad3DConstNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            value, out_data);
-      }
-    }
-  }
-};
-
-template <typename T>
-class Pad3dGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::vector<int> pads = GetPaddings(context);
-    auto mode = context.Attr<std::string>("mode");
-    auto data_format = context.Attr<std::string>("data_format");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_in = context.Output<Tensor>(framework::GradVarName("X"));
-    auto d_in_dims = d_in->dims();
-    auto d_out_dims = d_out->dims();
-    const T* d_out_data = d_out->data<T>();
-    T* d_in_data = d_in->mutable_data<T>(context.GetPlace());
-
-    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(context.template device_context<platform::CUDADeviceContext>(),
-             d_in, static_cast<T>(0));
-
-    const int pad_left = pads[0];
-    const int pad_top = pads[2];
-    const int pad_front = pads[4];
-
-    const int num = d_in_dims[0];
-
-    auto stream = context.cuda_device_context().stream();
-    int block = PADDLE_CUDA_NUM_THREADS;
-    const int out_size = d_out->numel();
-    const int in_size = d_in->numel();
-    int grid = (out_size + block - 1) / block;
-
-    if (data_format == "NCDHW") {
-      const int channels = d_in_dims[1];
-      const int in_depth = d_in_dims[2];
-      const int in_height = d_in_dims[3];
-      const int in_width = d_in_dims[4];
-      const int out_depth = d_out_dims[2];
-      const int out_height = d_out_dims[3];
-      const int out_width = d_out_dims[4];
-
-      if (mode == "reflect") {
-        Pad3DGradReflectNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else if (mode == "replicate") {
-        Pad3DGradReplicateNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else if (mode == "circular") {
-        Pad3DGradCircularNCDHW<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else {
-        grid = (in_size + block - 1) / block;
-        Pad3DGradConstNCDHW<T><<<grid, block, 0, stream>>>(
-            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      }
-    } else {
-      const int channels = d_in_dims[4];
-      const int in_depth = d_in_dims[1];
-      const int in_height = d_in_dims[2];
-      const int in_width = d_in_dims[3];
-      const int out_depth = d_out_dims[1];
-      const int out_height = d_out_dims[2];
-      const int out_width = d_out_dims[3];
-      if (mode == "reflect") {
-        Pad3DGradReflectNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else if (mode == "replicate") {
-        Pad3DGradReplicateNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else if (mode == "circular") {
-        Pad3DGradCircularNDHWC<T><<<grid, block, 0, stream>>>(
-            out_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      } else {
-        grid = (in_size + block - 1) / block;
-        Pad3DGradConstNDHWC<T><<<grid, block, 0, stream>>>(
-            in_size, d_in_data, num, channels, in_depth, in_height, in_width,
-            out_depth, out_height, out_width, pad_front, pad_top, pad_left,
-            d_out_data);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(pad3d, ops::Pad3dCUDAKernel<plat::float16>,
-                        ops::Pad3dCUDAKernel<float>,
-                        ops::Pad3dCUDAKernel<double>, ops::Pad3dCUDAKernel<int>,
-                        ops::Pad3dCUDAKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL(pad3d_grad, ops::Pad3dGradCUDAKernel<plat::float16>,
-                        ops::Pad3dGradCUDAKernel<float>,
-                        ops::Pad3dGradCUDAKernel<double>);
diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc
index d0cb674b4049f..adc4a2ffaf8c5 100644
--- a/paddle/fluid/operators/pad_op_npu.cc
+++ b/paddle/fluid/operators/pad_op_npu.cc
@@ -90,5 +90,5 @@ namespace plat = paddle::platform;
 REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel<plat::float16>,
                        ops::PadNPUKernel<float>, ops::PadNPUKernel<int>);
 
-REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel<plat::float16>,
+REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel<plat::float16>,
                        ops::PadGradNPUKernel<float>);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 9bd6ae8bab829..de35f67405810 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -9,14 +9,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/prelu_op.h"
-
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
+framework::OpKernelType innerGetKernelTypeForVar(
+    const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) {
+#ifdef PADDLE_WITH_MKLDNN
+  auto isOneDNNKernelChosen =
+      (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN);
+  auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN);
+  auto isModelNHWC =
+      (paddle::platform::MKLDNNDeviceContext::tls()
+           .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC);
+  // All inputs (including alpha) need shape rotating
+  if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(),
+                                   framework::DataLayout::kNHWC);
+  }
+#endif
+  return framework::OpKernelType(expected_kernel_type.data_type_,
+                                 tensor.place(), tensor.layout());
+}
+
 class PReluOp : public framework::OperatorWithKernel {
  public:
   PReluOp(const std::string &type, const framework::VariableNameMap &inputs,
@@ -24,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel {
           const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu");
-    OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu");
-
-    auto x_dim = ctx->GetInputDim("X");
-    std::string mode = ctx->Attrs().Get<std::string>("mode");
-    if (mode == "all") {
-      PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1,
-                        platform::errors::InvalidArgument(
-                            "For mode 'all', size of weight Alpha must be one. "
-                            "But recevied alpha's size: %d.",
-                            product(ctx->GetInputDim("Alpha"))));
-    } else if (mode == "channel") {
-      auto x_rank = x_dim.size();
-      PADDLE_ENFORCE_GE(x_rank, 2,
-                        platform::errors::InvalidArgument(
-                            "For mode 'channel', rank of input X must be "
-                            "equal or larger than 2. But recevied X's "
-                            "rank: %d",
-                            x_rank));
-      const std::string data_format_str =
-          ctx->Attrs().Get<std::string>("data_format");
-      PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC",
-                        true,
-                        platform::errors::InvalidArgument(
-                            "For mode 'channel', data_format must be one of "
-                            "NCHW and NHWC. But recevied data_format: %s",
-                            data_format_str));
-      if (data_format_str == "NCHW") {
-        PADDLE_ENFORCE_EQ(
-            product(ctx->GetInputDim("Alpha")) == x_dim[1], true,
-            platform::errors::InvalidArgument(
-                "For mode 'channel', size of weight Alpha must be "
-                "equal to the number of channels of input(x). But "
-                "recevied alpha's size: %d, x_dim[1]: %d",
-                product(ctx->GetInputDim("Alpha")), x_dim[1]));
-      } else {
-        PADDLE_ENFORCE_EQ(
-            product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true,
-            platform::errors::InvalidArgument(
-                "For mode 'channel', size of weight Alpha must be "
-                "equal to the number of channels of input(x). But "
-                "recevied alpha's size: %d, x_dim[%d]: %d",
-                product(ctx->GetInputDim("Alpha")), x_rank - 1,
-                x_dim[x_rank - 1]));
-      }
-
-    } else if (mode == "element") {
-      auto alpha_dim = ctx->GetInputDim("Alpha");
-      auto alpha_rank = alpha_dim.size();
-      auto x_rank = x_dim.size();
-      PADDLE_ENFORCE_GE(x_rank, 1,
-                        platform::errors::InvalidArgument(
-                            "For mode 'element', rank of input X must be "
-                            "equal or larger than 2. But recevied X's "
-                            "rank: %d",
-                            x_rank));
-      PADDLE_ENFORCE_EQ(
-          alpha_rank, x_rank,
-          platform::errors::InvalidArgument(
-              "For mode 'element', rank of weight Alpha must be ",
-              "equal to the rank of input(x). But recevied alpha's rank: %d, "
-              "x's rank: %d.",
-              alpha_rank, x_rank));
-      size_t x_product = 1;
-      size_t alpha_product = 1;
-      for (int64_t i = x_rank - 1; i > 0; i--) {
-        x_product *= x_dim[i];
-        alpha_product *= alpha_dim[i];
-      }
-      PADDLE_ENFORCE_EQ(
-          alpha_product, x_product,
-          platform::errors::InvalidArgument(
-              "For mode 'element', the size of weight Alpha must be "
-              "equal to the size of input(x). But recevied alpha's size: %d, "
-              "x's size: %d.",
-              alpha_product, x_product));
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. "
-          "But recevied "
-          "mode: '%s'.",
-          mode));
-    }
-    ctx->ShareDim("X", /*->*/ "Out");
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -128,6 +64,12 @@ class PReluOp : public framework::OperatorWithKernel {
 #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    return innerGetKernelTypeForVar(tensor, expected_kernel_type);
+  }
 };
 
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -212,6 +154,12 @@ class PReluGradOp : public framework::OperatorWithKernel {
 #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const {
+    return innerGetKernelTypeForVar(tensor, expected_kernel_type);
+  }
 };
 
 template <typename T>
@@ -236,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker<T> {
 
 namespace ops = paddle::operators;
 
+DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor,
+                            PD_INFER_META(phi::PReluInferMeta));
 REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
                   ops::PReluGradOpMaker<paddle::framework::OpDesc>,
-                  ops::PReluGradOpMaker<paddle::imperative::OpBase>);
+                  ops::PReluGradOpMaker<paddle::imperative::OpBase>,
+                  PReluInferShapeFunctor);
 REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
-REGISTER_OP_CPU_KERNEL(
-    prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PReluKernel<paddle::platform::CPUDeviceContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    prelu_grad, ops::PReluGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PReluGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
deleted file mode 100644
index 12e55d042d703..0000000000000
--- a/paddle/fluid/operators/prelu_op.cu
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/prelu.h"
-#include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-#define CUDA_NUM_THREADS 1024
-
-inline static int PADDLE_GET_BLOCKS(const int N) {
-  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-}
-
-template <typename DeviceContext, typename T>
-class CUDAPReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* out = context.Output<Tensor>("Out");
-
-    const T* x_ptr = x->data<T>();
-    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-
-    const T* alpha_ptr = alpha->data<T>();
-    auto& mode = context.Attr<std::string>("mode");
-    auto& data_format = context.Attr<std::string>("data_format");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    auto x_rank = dim.size();
-
-    VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim["
-            << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel;
-
-    if (mode == "channel") {
-      bool channel_last = data_format == "NHWC";
-      size_t channel = channel_last ? dim[x_rank - 1] : dim[1];
-      math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
-      prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, dim[0], channel, channel_last,
-                         numel);
-    } else if (mode == "element") {
-      math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
-      prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, dim[0], numel);
-    } else {
-      math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
-      prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
-                   o_ptr, numel);
-    }
-  }
-};
-
-enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar };
-
-template <typename T>
-__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,
-                                  const T* dy_ptr, T* dx_ptr, T* dalpha_ptr,
-                                  size_t channel_num, size_t plane_size,
-                                  size_t spatial_size, size_t numel,
-                                  PRELU_MODE mode) {
-  CUDA_KERNEL_LOOP(index, numel) {
-    T scale;
-    if (mode == Element) {
-      size_t element_index = index % spatial_size;
-      scale = alpha_ptr[element_index];
-    } else if (mode == ChannelFirst) {
-      size_t temp = index / plane_size;
-      size_t channel_index = temp % channel_num;
-      scale = alpha_ptr[channel_index];
-    } else if (mode == ChannelLast) {
-      size_t channel_index = index % channel_num;
-      scale = alpha_ptr[channel_index];
-    } else {
-      scale = alpha_ptr[0];
-    }
-    T x = x_ptr[index];
-    T dy = dy_ptr[index];
-    T zero = static_cast<T>(0);
-    if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy;
-    if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy;
-  }
-}
-
-template <typename T>
-class PreluOpGradFunctor {
- public:
-  void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy,
-                  T* dx, T* dalpha, const framework::DDim& input_dims,
-                  PRELU_MODE mode) {
-    size_t numel = 1;
-    for (size_t i = 0; i < input_dims.size(); ++i) {
-      numel *= input_dims[i];
-    }
-    size_t plane_size = numel / input_dims[0] / input_dims[1];
-    size_t spatial_size = numel / input_dims[0];
-    size_t channel =
-        mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1];
-
-    PReluOpGradKernel<
-        T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-        x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel,
-        mode);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CUDAPReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
-
-    const T* x_ptr = x->data<T>();
-    const T* alpha_ptr = alpha->data<T>();
-    const T* dy_ptr = dy->data<T>();
-    T* dx_ptr = dx ? dx->mutable_data<T>(context.GetPlace()) : nullptr;
-    T* dalpha_ptr =
-        dalpha ? dalpha->mutable_data<T>(context.GetPlace()) : nullptr;
-
-    if (!dx && !dalpha) return;
-
-    auto& mode = context.Attr<std::string>("mode");
-    auto& data_format = context.Attr<std::string>("data_format");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    auto x_rank = dim.size();
-    std::vector<int> input_shape = phi::vectorize<int>(dim);
-    auto stream = context.cuda_device_context().stream();
-
-    T* dalpha_tmp_ptr;
-    Tensor dalpha_tmp;
-    if (dalpha_ptr == nullptr) {
-      dalpha_tmp_ptr = dalpha_ptr;
-    } else {
-      auto& dev_ctx = context.template device_context<DeviceContext>();
-      dalpha_tmp = context.AllocateTmpTensor<T, DeviceContext>(dim, dev_ctx);
-      dalpha_tmp_ptr = dalpha_tmp.mutable_data<T>(context.GetPlace());
-    }
-
-    PRELU_MODE m;
-    bool channel_last = false;
-    if (mode == "element") {
-      m = Element;
-    } else if (mode == "channel") {
-      channel_last = data_format == "NHWC";
-      m = channel_last ? ChannelLast : ChannelFirst;
-    } else {
-      m = Scalar;
-    }
-    PreluOpGradFunctor<T> prelu_grad;
-    prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim,
-               m);
-
-    if (dalpha_tmp_ptr == nullptr) return;
-
-    std::vector<int> reduce_dims;
-    for (size_t i = 0; i < dim.size(); i++) {
-      if (mode == "channel" && !channel_last && i == 1) continue;
-      if (mode == "channel" && channel_last && i == dim.size() - 1) continue;
-      if (mode == "element" && i != 0) continue;
-      reduce_dims.push_back(i);
-    }
-
-    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        context.cuda_device_context(), dalpha_tmp, dalpha,
-        kps::IdentityFunctor<T>(), reduce_dims, stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    prelu_grad,
-    ops::CUDAPReluGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CUDAPReluGradKernel<paddle::platform::CUDADeviceContext,
-                             plat::float16>,
-    ops::CUDAPReluGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h
deleted file mode 100644
index 384994eb37c2a..0000000000000
--- a/paddle/fluid/operators/prelu_op.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/transform.h"
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using platform::Transform;
-
-template <typename DeviceContext, typename T>
-class PReluKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* out = context.Output<Tensor>("Out");
-
-    const T* x_ptr = x->data<T>();
-    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-
-    const T* alpha_ptr = alpha->data<T>();
-    auto& mode = context.Attr<std::string>("mode");
-    auto& data_format = context.Attr<std::string>("data_format");
-
-    int numel = x->numel();
-    auto dim = x->dims();
-    int index = 0;
-    int i = 0;
-    if (mode == "channel") {
-      if (data_format == "NCHW") {
-        int temp = 1;
-        for (int j = 2; j < dim.size(); j++) {
-          temp *= dim[j];
-        }
-        for (i = 0; i < numel; i++) {
-          index = (i / temp) % dim[1];
-          o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
-        }
-      } else {
-        for (i = 0; i < numel; i++) {
-          index = i % dim[dim.size() - 1];
-          o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
-        }
-      }
-    } else if (mode == "element") {
-      int temp = 1;
-      for (int j = 1; j < dim.size(); j++) {
-        temp *= dim[j];
-      }
-      for (i = 0; i < numel; i++) {
-        index = i % temp;
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
-      }
-    } else {
-      for (i = 0; i < numel; i++) {
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
-      }
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class PReluGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
-    auto* alpha = context.Input<Tensor>("Alpha");
-    const T* alpha_ptr = alpha->data<T>();
-    const T* x_ptr = x->data<T>();
-    const T* dout_ptr = dout->data<T>();
-    std::string mode = context.Attr<std::string>("mode");
-    auto& data_format = context.Attr<std::string>("data_format");
-    int numel = x->numel();
-    auto dim = x->dims();
-    int index = 0;
-    int i = 0;
-    if (dx) {
-      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
-      if (mode == "channel") {
-        if (data_format == "NCHW") {
-          int temp = 1;
-          for (int j = 2; j < dim.size(); j++) {
-            temp *= dim[j];
-          }
-          for (i = 0; i < numel; i++) {
-            index = (i / temp) % dim[1];
-            dx_ptr[i] =
-                x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
-          }
-        } else {
-          for (i = 0; i < numel; i++) {
-            index = i % dim[dim.size() - 1];
-            dx_ptr[i] =
-                x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
-          }
-        }
-      } else if (mode == "element") {
-        int temp = 1;
-        for (int j = 1; j < dim.size(); j++) {
-          temp *= dim[j];
-        }
-        for (i = 0; i < numel; i++) {
-          index = i % temp;
-          dx_ptr[i] =
-              x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
-        }
-      } else {
-        for (i = 0; i < numel; i++) {
-          dx_ptr[i] = x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
-        }
-      }
-    }
-
-    index = 0;
-    if (dalpha) {
-      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
-      memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel());
-
-      if (mode == "channel") {
-        if (data_format == "NCHW") {
-          int temp = 1;
-          for (int j = 2; j < dim.size(); j++) {
-            temp *= dim[j];
-          }
-          for (i = 0; i < numel; i++) {
-            index = (i / temp) % dim[1];
-            dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-          }
-        } else {
-          for (i = 0; i < numel; i++) {
-            index = i % dim[dim.size() - 1];
-            dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-          }
-        }
-      } else if (mode == "element") {
-        int temp = 1;
-        for (int j = 1; j < dim.size(); j++) {
-          temp *= dim[j];
-        }
-        for (i = 0; i < numel; i++) {
-          index = i % temp;
-          dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-        }
-      } else {
-        for (i = 0; i < numel; i++) {
-          dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
-        }
-      }
-    }
-
-    // TODO(Guanzhong): add GPU kernels
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc
index 40e3cbde3b009..82fc9ef1b7858 100644
--- a/paddle/fluid/operators/qr_op.cc
+++ b/paddle/fluid/operators/qr_op.cc
@@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker,
 
 REGISTER_OPERATOR(qr_grad, ops::QrGradOp);
 
-REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel<float>, ops::QrCPUKernel<double>);
-
 REGISTER_OP_CPU_KERNEL(
     qr_grad, ops::QrGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::QrGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h
index f09a07e96cd34..5ef02d8942797 100644
--- a/paddle/fluid/operators/qr_op.h
+++ b/paddle/fluid/operators/qr_op.h
@@ -48,85 +48,6 @@ static inline std::tuple<bool, bool> _parse_qr_mode(std::string mode) {
   return std::make_tuple(compute_q, reduced);
 }
 
-template <typename T>
-class QrCPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool compute_q;
-    bool reduced_mode;
-    const Tensor& x = *context.Input<Tensor>("X");
-    Tensor& q = *context.Output<Tensor>("Q");
-    Tensor& r = *context.Output<Tensor>("R");
-    std::string mode = context.Attr<std::string>("mode");
-    std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
-
-    auto numel = x.numel();
-    PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet(
-                                    "The input of QR is empty."));
-    auto x_dims = x.dims();
-    int x_rank = x_dims.size();
-    int m = x_dims[x_rank - 2];
-    int n = x_dims[x_rank - 1];
-    int min_mn = std::min(m, n);
-    int k = reduced_mode ? min_mn : m;
-    int batch_size = numel / (m * n);
-    int x_stride = m * n;
-    int q_stride = m * k;
-    int r_stride = k * n;
-
-    auto* x_data = x.data<phi::dtype::Real<T>>();
-    T* q_data = nullptr;
-    if (compute_q) {
-      q_data = q.mutable_data<phi::dtype::Real<T>>(
-          context.GetPlace(),
-          size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
-      memset(q_data, 0,
-             size_t(batch_size * m * k * sizeof(phi::dtype::Real<T>)));
-    }
-    auto* r_data = r.mutable_data<phi::dtype::Real<T>>(
-        context.GetPlace(),
-        size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
-    memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real<T>)));
-
-    // Implement QR by calling Eigen
-    for (int i = 0; i < batch_size; ++i) {
-      const T* x_matrix_ptr = x_data + i * x_stride;
-      T* r_matrix_ptr = r_data + i * r_stride;
-      using EigenDynamicMatrix =
-          Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-      auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
-      Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
-      if (reduced_mode) {
-        auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
-        auto r_matrix_view =
-            qr_top_matrix.template triangularView<Eigen::Upper>();
-        auto r_matrix = EigenDynamicMatrix(r_matrix_view);
-        memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
-      } else {
-        auto r_matrix_view =
-            qr.matrixQR().template triangularView<Eigen::Upper>();
-        auto r_matrix = EigenDynamicMatrix(r_matrix_view);
-        memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
-      }
-
-      if (compute_q) {
-        T* q_matrix_ptr = q_data + i * q_stride;
-        if (reduced_mode) {
-          auto q_matrix =
-              qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
-          q_matrix.transposeInPlace();
-          memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
-        } else {
-          auto q_matrix =
-              qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
-          q_matrix.transposeInPlace();
-          memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
-        }
-      }
-    }
-  }
-};
-
 template <typename DeviceContext, typename T>
 class QrGradKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
index 0a5d54e72c845..83a21a919dcaa 100644
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
+++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc
@@ -12,9 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h"
-
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace framework {
@@ -56,22 +59,12 @@ class FrobeniusNormOpMaker : public ops::ReduceOpMaker {
   virtual std::string GetOpType() const { return "Reduce frobenius_norm"; }
 };
 
+DECLARE_INFER_SHAPE_FUNCTOR(frobenius_norm, FrobeniusNormInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
 REGISTER_OPERATOR(frobenius_norm, ops::ReduceOp, FrobeniusNormOpMaker,
                   ops::FrobeniusNormOpGradMaker<paddle::framework::OpDesc>,
-                  ops::FrobeniusNormOpGradMaker<paddle::imperative::OpBase>);
+                  ops::FrobeniusNormOpGradMaker<paddle::imperative::OpBase>,
+                  FrobeniusNormInferShapeFunctor);
 
 REGISTER_OPERATOR(frobenius_norm_grad, ops::ReduceGradOp);
-
-REGISTER_OP_CPU_KERNEL(frobenius_norm,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         float, ops::FrobeniusNormFunctor>,
-                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
-                                         double, ops::FrobeniusNormFunctor>);
-
-template <typename T>
-using CPUFrobeniusNormGradKernel =
-    ops::FrobeniusNormGradKernel<paddle::platform::CPUDeviceContext, T,
-                                 ops::FrobeniusNormGradFunctor>;
-
-REGISTER_OP_CPU_KERNEL(frobenius_norm_grad, CPUFrobeniusNormGradKernel<float>,
-                       CPUFrobeniusNormGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu
deleted file mode 100644
index b2cef09df9436..0000000000000
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/frobenius_norm_op.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-
-template <typename T>
-using CUDAFrobeniusNormKernel =
-    ops::ReduceKernel<paddle::platform::CUDADeviceContext, T,
-                      ops::FrobeniusNormFunctor>;
-
-REGISTER_OP_CUDA_KERNEL(frobenius_norm, CUDAFrobeniusNormKernel<float>,
-                        CUDAFrobeniusNormKernel<double>);
-
-template <typename T>
-using CUDAFrobeniusNormGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::FrobeniusNormGradFunctor>;
-
-REGISTER_OP_CUDA_KERNEL(frobenius_norm_grad, CUDAFrobeniusNormGradKernel<float>,
-                        CUDAFrobeniusNormGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h
deleted file mode 100644
index 0b6b87d99ecd9..0000000000000
--- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-// \partial \| X \|_F = \frac{X}{ \| X \|_F }
-template <typename DeviceContext, typename T, typename Functor>
-class FrobeniusNormGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // default use Eigen broadcast
-    ReduceGradKernel<DeviceContext, T, Functor, false> kernel;
-    kernel.Compute(context);
-  }
-};
-
-struct FrobeniusNormFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = ((x->square()).sum(dim)).sqrt();
-  }
-};
-
-struct FrobeniusNormGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = y->broadcast(dim);
-    dx->device(place) = *dx + dx->constant(1e-12f);
-    dx->device(place) = (*x / *dx) * (dy->broadcast(dim));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
index 955cf8d4448c1..9115d21b195e1 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
@@ -14,6 +14,10 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -28,9 +32,17 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+class ReduceAllOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_all"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_all"; }
+};
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace);
-REGISTER_OP_CPU_KERNEL(reduce_all,
-                       ops::BoolReduceKernel<paddle::platform::CPUDeviceContext,
-                                             bool, ops::AllFunctor>);
+REGISTER_OPERATOR(
+    reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ReduceAllInferShapeFunctor);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
index fa3800dd3c9e4..69561b9349888 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
@@ -14,6 +14,9 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -28,9 +31,18 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
+class ReduceAnyOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_any"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_any"; }
+};
 // kernel's device type is decided by input tensor place, to be consistent with
 // compare and logical ops
-REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace);
-REGISTER_OP_CPU_KERNEL(reduce_any,
-                       ops::BoolReduceKernel<paddle::platform::CPUDeviceContext,
-                                             bool, ops::AnyFunctor>);
+REGISTER_OPERATOR(
+    reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ReduceAnyInferShapeFunctor);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
index d057ee8f5d798..e327d19ab3be8 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
@@ -35,7 +35,7 @@ namespace p = paddle::platform;
 
 using Tensor = paddle::framework::Tensor;
 
-USE_OP(reduce_any);
+USE_OP_ITSELF(reduce_any);
 USE_OP_DEVICE_KERNEL(reduce_any, NPU);
 
 template <typename T>
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
index 41df8e4a15f09..15812778e0023 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc
@@ -35,13 +35,3 @@ REGISTER_OPERATOR(
     paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
     ReduceMaxInferShapeFunctor);
 REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp)
-
-REGISTER_OP_CPU_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
deleted file mode 100644
index 5ee38b8fa4629..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
index 4a18330913803..dc41979defb93 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc
@@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp,
                   ops::ReduceMeanDoubleGradDescMaker,
                   ops::ReduceMeanDoubleGradOpBaseMaker,
                   ops::ReduceMeanGradNoNeedBufferVarInferer);
-
-template <typename T>
-using CPUReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel<bool>,
-                       CPUReduceMeanGradKernel<float>,
-                       CPUReduceMeanGradKernel<double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
index 11aa78382e319..5e5b04d57b002 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc
@@ -14,21 +14,24 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
 
-REGISTER_REDUCE_OP(reduce_min);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
-                                  ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
-                      ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
-    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
-                      ops::MinFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace ops = paddle::operators;
+
+class ReduceMinOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_min"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_min"; }
+};
+
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
+
+REGISTER_OPERATOR(
+    reduce_min, ops::ReduceOp, ReduceMinOpMaker,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    ReduceMinInferShapeFunctor);
+REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp)
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
deleted file mode 100644
index bf886063786a8..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 65cca94814e88..ff1ddb4175fef 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -265,67 +265,6 @@ class ReduceKernel : public framework::OpKernel<T> {
         framework::TransToPhiDataType(cast_out_dtype), output);
   }
 };
-template <typename DeviceContext, typename OutT, typename Functor>
-class BoolReduceKernel : public framework::OpKernel<OutT> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<OutT>(context.GetPlace());
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    // The dims has full dim, set the reduce_all is True
-    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
-    std::set<int> dims_set(dims.begin(), dims.end());
-    bool full_dim = true;
-    for (auto i = 0; i < input_dim_size; i++) {
-      if (dims_set.find(i) == dims_set.end()) {
-        full_dim = false;
-        break;
-      }
-    }
-    reduce_all = (reduce_all || full_dim);
-
-    if (reduce_all) {
-      // Flatten and reduce 1-D tensor
-      auto x = EigenVector<OutT>::Flatten(*input);
-      auto out = EigenScalar<OutT>::From(*output);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({{0}});
-      Functor functor;
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      int ndim = input->dims().size();
-      int rdim = dims.size();
-      // comments for accelerating compiling temporarily.
-      if (ndim > 6) {
-        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
-                                                     dims, keep_dim);
-      } else {
-        HANDLE_DIM(6, 5);
-        HANDLE_DIM(6, 4);
-        HANDLE_DIM(6, 3);
-        HANDLE_DIM(6, 2);
-        HANDLE_DIM(6, 1);
-        HANDLE_DIM(5, 4);
-        HANDLE_DIM(5, 3);
-        HANDLE_DIM(5, 2);
-        HANDLE_DIM(5, 1);
-        HANDLE_DIM(4, 3);
-        HANDLE_DIM(4, 2);
-        HANDLE_DIM(4, 1);
-        HANDLE_DIM(3, 2);
-        HANDLE_DIM(3, 1);
-        HANDLE_DIM(2, 1);
-        HANDLE_DIM(1, 1);
-      }
-    }
-  }
-};
 
 template <typename DeviceContext, typename T, typename Functor>
 void LaunchReduceGradKernel(const framework::ExecutionContext& context,
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
index eb745ab9c56c5..b1abdf9e8a758 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc
@@ -14,6 +14,10 @@
 
 #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -26,14 +30,20 @@ class CPUDeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
-REGISTER_REDUCE_OP(reduce_prod);
+namespace ops = paddle::operators;
+
+class ReduceProdOpMaker : public ops::ReduceOpMaker {
+ protected:
+  virtual std::string GetName() const { return "reduce_prod"; }
+  virtual std::string GetOpType() const { return "Reduce reduce_prod"; }
+};
+
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor,
+                            PD_INFER_META(phi::ReduceInferMetaBase));
 
-REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::ProdGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::ProdGradFunctor>);
+REGISTER_OPERATOR(
+    reduce_prod, ops::ReduceOp, ReduceProdOpMaker,
+    paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
+    paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
+    ReduceProdInferShapeFunctor);
+REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
deleted file mode 100644
index 0610cdd94f89c..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
index 98a1610be607e..975eecafc06a6 100644
--- a/paddle/fluid/operators/reverse_op.cc
+++ b/paddle/fluid/operators/reverse_op.cc
@@ -12,60 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reverse_op.h"
 #include <memory>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace operators {
 
 class ReverseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Reverse");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Reverse");
-
-    auto x_var_type = ctx->GetInputsVarType("X")[0];
-    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
-    if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      PADDLE_ENFORCE_EQ(
-          axis.size(), 1,
-          platform::errors::InvalidArgument(
-              "The size of axis must be 1 when the Input(X) is LoDTensorArray, "
-              "but received %d.",
-              axis.size()));
-      PADDLE_ENFORCE_EQ(axis[0], 0, platform::errors::InvalidArgument(
-                                        "The value of axis should be 1 when "
-                                        "the Input(X) is LoDTensorArray, "
-                                        "but received %d.",
-                                        axis[0]));
-      // In runtime, shape is determined by RunImpl.
-      if (!ctx->IsRuntime()) {
-        const auto& x_dims = ctx->GetInputDim("X");
-        ctx->SetOutputDim("Out", x_dims);
-      }
-      return;
-    }
-    const auto& x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_NE(axis.empty(), true, platform::errors::InvalidArgument(
-                                              "'axis' can not be empty."));
-    for (int a : axis) {
-      PADDLE_ENFORCE_LT(a, x_dims.size(),
-                        paddle::platform::errors::OutOfRange(
-                            "The axis must be less than input tensor's rank. "
-                            "but got %d >= %d",
-                            a, x_dims.size()));
-      PADDLE_ENFORCE_GE(
-          a, -x_dims.size(),
-          paddle::platform::errors::OutOfRange(
-              "The axis must be greater than the negative number of "
-              "input tensor's rank, but got %d < %d",
-              a, -x_dims.size()));
-    }
-    ctx->SetOutputDim("Out", x_dims);
-  }
 };
 
 class ReverseOpVarTypeInference : public framework::VarTypeInference {
@@ -134,23 +94,10 @@ class ReverseGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(reverse, ReverseInferShapeFunctor,
+                            PD_INFER_META(phi::ReverseInferMeta));
 REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
                   ops::ReverseGradMaker<paddle::framework::OpDesc>,
                   ops::ReverseGradMaker<paddle::imperative::OpBase>,
-                  ops::ReverseOpVarTypeInference);
+                  ops::ReverseOpVarTypeInference, ReverseInferShapeFunctor);
 REGISTER_OPERATOR(reverse_grad, ops::ReverseOp, ops::ReverseOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(
-    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
deleted file mode 100644
index d5e331e2fe5f6..0000000000000
--- a/paddle/fluid/operators/reverse_op.h
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T, int Rank>
-struct ReverseFunctor {
-  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
-                  framework::LoDTensor* out, const std::vector<int>& axis) {
-    Eigen::DSizes<bool, Rank> reverse_axis;
-    for (int i = 0; i < Rank; ++i) {
-      reverse_axis[i] = false;
-    }
-    for (int a : axis) {
-      if (a >= 0) {
-        reverse_axis[a] = true;
-      } else {
-        reverse_axis[Rank + a] = true;
-      }
-    }
-
-    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
-    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
-    auto& dev = *context.eigen_device();
-
-    EigenReverse<std::decay_t<decltype(dev)>, T, Rank>::Eval(
-        dev, out_eigen, in_eigen, reverse_axis);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ReverseKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x_var = context.InputVar("X");
-    const auto& axis = context.Attr<std::vector<int>>("axis");
-    if (x_var->IsType<framework::LoDTensorArray>()) {
-      auto& x_array = x_var->Get<framework::LoDTensorArray>();
-      auto* out_array = context.Output<framework::LoDTensorArray>("Out");
-
-      out_array->resize(x_array.size());
-      for (size_t offset = 0; offset < x_array.size(); offset++) {
-        auto& x_tensor = x_array.at(offset);
-        PADDLE_ENFORCE_GT(
-            x_tensor.memory_size(), 0,
-            platform::errors::PreconditionNotMet(
-                "The input LoDTensorArray X[%d] holds no memory.", offset));
-        auto out_offset = x_array.size() - offset - 1;
-        auto* out_tensor = &out_array->at(out_offset);
-
-        out_tensor->set_lod(x_tensor.lod());
-        paddle::framework::TensorCopy(x_tensor, context.GetPlace(), out_tensor);
-      }
-      return;
-    }
-    auto* x = context.Input<framework::LoDTensor>("X");
-    auto* out = context.Output<framework::LoDTensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    int rank = x->dims().size();
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    switch (rank) {
-      case 1:
-        ReverseFunctor<DeviceContext, T, 1> functor1;
-        functor1(dev_ctx, *x, out, axis);
-        break;
-      case 2:
-        ReverseFunctor<DeviceContext, T, 2> functor2;
-        functor2(dev_ctx, *x, out, axis);
-        break;
-      case 3:
-        ReverseFunctor<DeviceContext, T, 3> functor3;
-        functor3(dev_ctx, *x, out, axis);
-        break;
-      case 4:
-        ReverseFunctor<DeviceContext, T, 4> functor4;
-        functor4(dev_ctx, *x, out, axis);
-        break;
-      case 5:
-        ReverseFunctor<DeviceContext, T, 5> functor5;
-        functor5(dev_ctx, *x, out, axis);
-        break;
-      case 6:
-        ReverseFunctor<DeviceContext, T, 6> functor6;
-        functor6(dev_ctx, *x, out, axis);
-        break;
-      default:
-        PADDLE_THROW(paddle::platform::errors::OutOfRange(
-            "The reserve operator does not support input tensors"
-            "whose ranks are greater than 6."));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index ac0cd75237baf..bf78b6a696559 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/roi_align_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::NotFound("Input(X) of ROIAlignOp "
-                                                 "is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
-                      platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
-                                                 "is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::NotFound("Output(Out) of ROIAlignOp "
-                                                 "is not found."));
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    if (ctx->HasInput("RoisNum")) {
-      auto rois_num_dims = ctx->GetInputDim("RoisNum");
-      PADDLE_ENFORCE_EQ(
-          rois_num_dims.size(), 1,
-          platform::errors::InvalidArgument("The size of RoisNum should be 1"
-                                            ", but received size = %d",
-                                            rois_num_dims.size()));
-    }
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "The format of Input(X) in"
-            "RoIAlignOp is NCHW. And the rank of input must be 4. "
-            "But received rank = %d",
-            input_dims.size()));
-    PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
-                                               "The rank of Input(ROIs) "
-                                               "in RoIAlignOp should be 2. "
-                                               "But the rank of RoIs is %d",
-                                               rois_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(rois_dims[1], 4,
-                        platform::errors::InvalidArgument(
-                            "The second dimension "
-                            "of Input(ROIs) should be 4. But received the "
-                            "dimension = %d",
-                            rois_dims[1]));
-    }
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      platform::errors::InvalidArgument(
-                          "The 'pooled_height' attribute in RoIAlignOp is "
-                          "invalid. The height must be greater than 0. But "
-                          "received 'pooled_height' = %d",
-                          pooled_height));
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      platform::errors::InvalidArgument(
-                          "The 'pooled_width' attribute in RoIAlignOp is "
-                          "invalid. The width must be greater than 0. But "
-                          "received 'pooled_width' = %d",
-                          pooled_width));
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      platform::errors::InvalidArgument(
-                          "The 'spatial_scale' attribute in RoIAlignOp is "
-                          "invalid. The scale must be greater than 0. But "
-                          "received 'spatial_scale' = %f",
-                          spatial_scale));
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = input_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-
-    ctx->SetOutputDim("Out", out_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
+                            PD_INFER_META(phi::RoiAlignInferMeta));
+
 REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
                   ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
-                  ops::ROIAlignGradMaker<paddle::imperative::OpBase>);
+                  ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
+                  RoiAlignInferShapeFunctor);
 REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
                   ops::RoiAlignGradNoNeedBufVarsInferer);
 
-REGISTER_OP_CPU_KERNEL(
-    roi_align_grad,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_VERSION(roi_align)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
deleted file mode 100644
index 1a2e64cd45ca4..0000000000000
--- a/paddle/fluid/operators/roi_align_op.cu
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/roi_align_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-static constexpr int kROISize = 4;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-template <class T>
-__device__ void BilinearInterpolateGradient(const int height, const int width,
-                                            T y, T x, T* w1, T* w2, T* w3,
-                                            T* w4, int* x_low, int* x_high,
-                                            int* y_low, int* y_high) {
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    return;
-  }
-
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  *y_low = static_cast<int>(y);
-  *x_low = static_cast<int>(x);
-  if (*y_low >= height - 1) {
-    *y_high = *y_low = height - 1;
-    y = static_cast<T>(*y_low);
-  } else {
-    *y_high = *y_low + 1;
-  }
-  if (*x_low >= width - 1) {
-    *x_high = *x_low = width - 1;
-    x = static_cast<T>(*x_low);
-  } else {
-    *x_high = *x_low + 1;
-  }
-  T ly = y - *y_low, lx = x - *x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
-
-  return;
-}
-
-template <typename T>
-__global__ void GPUROIAlignBackward(
-    const int nthreads, const T* input_rois, const T* out_grad,
-    const int num_rois, const float spatial_scale, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, const int sampling_ratio, int* roi_batch_id_data,
-    T* input_grad, const bool continuous_coordinate) {
-  CUDA_KERNEL_LOOP(i, nthreads) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-
-    T roi_offset = continuous_coordinate ? T(0.5) : 0;
-    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
-    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
-    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
-    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
-
-    T roi_width = roi_xmax - roi_xmin;
-    T roi_height = roi_ymax - roi_ymin;
-    if (!continuous_coordinate) {
-      roi_width = max(roi_width, static_cast<T>(1.));
-      roi_height = max(roi_height, static_cast<T>(1.));
-    }
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    T* offset_input_grad =
-        input_grad + (roi_batch_ind * channels + c) * height * width;
-
-    const T* offset_out_grad =
-        out_grad + (n * channels + c) * pooled_height * pooled_width;
-    const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw];
-
-    int roi_bin_grid_h = (sampling_ratio > 0)
-                             ? sampling_ratio
-                             : ceil(roi_height / pooled_height);
-    int roi_bin_grid_w =
-        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
-
-    const T count = roi_bin_grid_h * roi_bin_grid_w;
-    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-      const T y = roi_ymin + ph * bin_size_h +
-                  static_cast<T>(iy + .5f) * bin_size_h /
-                      static_cast<T>(roi_bin_grid_h);
-      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-        const T x = roi_xmin + pw * bin_size_w +
-                    static_cast<T>(ix + .5f) * bin_size_w /
-                        static_cast<T>(roi_bin_grid_w);
-        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
-        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
-        BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4,
-                                    &x_low, &x_high, &y_low, &y_high);
-        T diff1 = out_grad_this_bin * w1 / count;
-        T diff2 = out_grad_this_bin * w2 / count;
-        T diff3 = out_grad_this_bin * w3 / count;
-        T diff4 = out_grad_this_bin * w4 / count;
-        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low,
-                                  diff1);
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high,
-                                  diff2);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low,
-                                  diff3);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high,
-                                  diff4);
-        }
-      }
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-
-    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto aligned = ctx.Attr<bool>("aligned");
-
-    int rois_num = rois->dims()[0];
-    int channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (!in_grad) {
-      return;
-    }
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto gplace = ctx.GetPlace();
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-      int rois_batch_size = rois_num_t->numel();
-      std::vector<int> rois_num_list(rois_batch_size);
-      memory::Copy(cplace, rois_num_list.data(), gplace,
-                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = start; i < start + rois_num_list[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_list[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    auto roi_ptr =
-        memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int));
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    phi::funcs::SetConstant<Place, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-
-    int output_grad_size = out_grad->numel();
-    int blocks = NumBlocks(output_grad_size);
-    int threads = kNumCUDAThreads;
-
-    if (output_grad_size > 0) {
-      GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
-          spatial_scale, channels, height, width, pooled_height, pooled_width,
-          sampling_ratio, roi_id_data, in_grad->mutable_data<T>(ctx.GetPlace()),
-          aligned);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roi_align_grad,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
deleted file mode 100644
index 589e35e4ab7ae..0000000000000
--- a/paddle/fluid/operators/roi_align_op.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include <numeric>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <class T>
-void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
-                                   const T out_grad_this_bin, const T count,
-                                   T* batch_grad_data) {
-  int x_low, y_low, x_high, y_high;
-  T w1, w2, w3, w4;
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    w1 = w2 = w3 = w4 = 0;
-    x_low = x_high = y_low = y_high = -1;
-    return;
-  }
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  y_low = static_cast<int>(y);
-  x_low = static_cast<int>(x);
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = static_cast<T>(y_low);
-  } else {
-    y_high = y_low + 1;
-  }
-
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = static_cast<T>(x_low);
-  } else {
-    x_high = x_low + 1;
-  }
-
-  T ly = y - y_low, lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-  T diff1 = out_grad_this_bin * w1 / count;
-  T diff2 = out_grad_this_bin * w2 / count;
-  T diff3 = out_grad_this_bin * w3 / count;
-  T diff4 = out_grad_this_bin * w4 / count;
-  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-    *(batch_grad_data + y_low * width + x_low) += diff1;
-    *(batch_grad_data + y_low * width + x_high) += diff2;
-    *(batch_grad_data + y_high * width + x_low) += diff3;
-    *(batch_grad_data + y_high * width + x_high) += diff4;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto in_dims = in->dims();
-    auto aligned = ctx.Attr<bool>("aligned");
-
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    if (!in_grad) {
-      return;
-    }
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    int rois_batch_size;
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      auto* rois_num_data = rois_num_t->data<int>();
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_data[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_data[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-
-    int output_grad_size = out_grad->numel();
-
-    if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
-      return;
-    }
-
-    const T* rois_data = rois->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto in_stride = phi::stride(in->dims());
-    auto roi_stride = phi::stride(rois->dims());
-    auto out_stride = phi::stride(out_grad->dims());
-
-    T roi_offset = aligned ? T(0.5) : 0;
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_idx = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
-      T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
-      T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
-      T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
-
-      T roi_width = roi_xmax - roi_xmin;
-      T roi_height = roi_ymax - roi_ymin;
-      roi_width = std::max(roi_width, static_cast<T>(1.));
-      roi_height = std::max(roi_height, static_cast<T>(1.));
-      if (!aligned) {
-        roi_width = std::max(roi_width, static_cast<T>(1.));
-        roi_height = std::max(roi_height, static_cast<T>(1.));
-      }
-
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-      for (int c = 0; c < channels; ++c) {
-        T* batch_grad_data =
-            in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
-        const T* batch_out_grad_data =
-            out_grad_data + n * out_stride[0] + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            int pool_index = ph * pooled_width + pw;
-            T out_grad_this_bin = batch_out_grad_data[pool_index];
-            int roi_bin_grid_h = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_height / pooled_height);
-            int roi_bin_grid_w = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_width / pooled_width);
-            T count = roi_bin_grid_h * roi_bin_grid_w;
-            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-              const T y = roi_ymin + ph * bin_size_h +
-                          static_cast<T>(iy + .5f) * bin_size_h /
-                              static_cast<T>(roi_bin_grid_h);
-              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-                const T x = roi_xmin + pw * bin_size_w +
-                            static_cast<T>(ix + .5f) * bin_size_w /
-                                static_cast<T>(roi_bin_grid_w);
-                bilinear_interpolate_gradient(height, width, y, x,
-                                              out_grad_this_bin, count,
-                                              batch_grad_data);
-              }
-            }
-          }
-        }
-      }
-      rois_data += roi_stride[0];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index a512e7dcd682b..12e33d56c0020 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/roi_pool_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -26,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool");
-    OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool");
-    OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool");
-
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-
-    if (ctx->HasInput("RoisNum")) {
-      auto rois_num_dims = ctx->GetInputDim("RoisNum");
-      PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "The second dimension of RoisNum should "
-                            "be 1, but received dimension is %d",
-                            rois_num_dims.size()));
-    }
-    PADDLE_ENFORCE_EQ(input_dims.size(), 4,
-                      platform::errors::InvalidArgument(
-                          "The input data should be a four-dimensional "
-                          "tensor with [N,C,H,W], but received input data with "
-                          " %d dimension",
-                          input_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        rois_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)"
-            "given as [[x1, y1, x2, y2], ...], but received ROIs is "
-            "%d-dimensional LoDTensor",
-            rois_dims.size()));
-    PADDLE_ENFORCE_EQ(
-        rois_dims[1], kROISize,
-        platform::errors::InvalidArgument(
-            "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)"
-            "given as [[x1, y1, x2, y2], ...]. But the second dimension of  "
-            "the received data is %d",
-            rois_dims[1]));
-
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      platform::errors::OutOfRange(
-                          "The pooled output height must be greater than 0"
-                          "but received height is %d",
-                          pooled_height));
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      platform::errors::OutOfRange(
-                          "The pooled output width must be greater than 0"
-                          "but received width is %d",
-                          pooled_width));
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      platform::errors::OutOfRange(
-                          "The spatial scale must be greater than 0, "
-                          "but received spatial scale is %f",
-                          spatial_scale));
-
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = input_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-
-    ctx->SetOutputDim("Out", out_dims);
-    ctx->SetOutputDim("Argmax", out_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -212,20 +147,15 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor,
+                            PD_INFER_META(phi::RoiPoolInferMeta));
+
 REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
                   ops::ROIPoolGradMaker<paddle::framework::OpDesc>,
-                  ops::ROIPoolGradMaker<paddle::imperative::OpBase>);
+                  ops::ROIPoolGradMaker<paddle::imperative::OpBase>,
+                  RoiPoolInferShapeFunctor);
 REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(
-    roi_pool,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, int>);
-REGISTER_OP_CPU_KERNEL(
-    roi_pool_grad,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>);
+
 REGISTER_OP_VERSION(roi_pool)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
deleted file mode 100644
index b907b1114bbc0..0000000000000
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <vector>
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/roi_pool_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
-
-static inline int NumBlocks(const int N) {
-  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
-                  kNumMaxinumNumBlocks);
-}
-
-template <typename T>
-__global__ void GPUROIPoolForward(
-    const int nthreads, const T* input_data, const T* input_rois,
-    const float spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (size_t i = index; i < nthreads; i += offset) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    const T* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = roi_batch_id_data[n];
-    int roi_start_w = round(offset_input_rois[0] * spatial_scale);
-    int roi_start_h = round(offset_input_rois[1] * spatial_scale);
-    int roi_end_w = round(offset_input_rois[2] * spatial_scale);
-    int roi_end_h = round(offset_input_rois[3] * spatial_scale);
-
-    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-
-    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
-                                        static_cast<double>(roi_height) /
-                                        static_cast<double>(pooled_height)));
-    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
-                                        static_cast<double>(roi_width) /
-                                        static_cast<double>(pooled_width)));
-    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
-                                     static_cast<double>(roi_height) /
-                                     static_cast<double>(pooled_height)));
-    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
-                                     static_cast<double>(roi_width) /
-                                     static_cast<double>(pooled_width)));
-    hstart = min(max(hstart + roi_start_h, 0), height);
-    hend = min(max(hend + roi_start_h, 0), height);
-    wstart = min(max(wstart + roi_start_w, 0), width);
-    wend = min(max(wend + roi_start_w, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
-    int maxidx = -1;
-    const T* offset_input_data =
-        input_data + (roi_batch_ind * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int input_data_index = h * width + w;
-        if (offset_input_data[input_data_index] > maxval) {
-          maxval = offset_input_data[input_data_index];
-          maxidx = input_data_index;
-        }
-      }
-    }
-    output_data[i] = maxval;
-    if (argmax_data) {
-      argmax_data[i] = maxidx;
-    }
-  }
-}
-
-template <typename T>
-__global__ void GPUROIPoolBackward(
-    const int nthreads, const T* input_rois, const T* output_grad,
-    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, int* roi_batch_id_data,
-    T* input_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int offset = blockDim.x * gridDim.x;
-  for (int i = index; i < nthreads; i += offset) {
-    int pw = i % pooled_width;
-    int ph = (i / pooled_width) % pooled_height;
-    int c = (i / pooled_width / pooled_height) % channels;
-    int n = i / pooled_width / pooled_height / channels;
-
-    int roi_batch_ind = roi_batch_id_data[n];
-    int input_offset = (roi_batch_ind * channels + c) * height * width;
-    int output_offset = (n * channels + c) * pooled_height * pooled_width;
-    const T* offset_output_grad = output_grad + output_offset;
-    T* offset_input_grad = input_grad + input_offset;
-    const int64_t* offset_argmax_data = argmax_data + output_offset;
-
-    int argmax = offset_argmax_data[ph * pooled_width + pw];
-    if (argmax != -1) {
-      platform::CudaAtomicAdd(
-          offset_input_grad + argmax,
-          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
-    }
-  }
-}
-
-template <typename Place, typename T>
-class GPUROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto* argmax = ctx.Output<Tensor>("Argmax");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    auto in_stride = phi::stride(in_dims);
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-
-    int rois_num = rois->dims()[0];
-
-    if (rois_num == 0) return;
-
-    int output_size = out->numel();
-    int blocks = NumBlocks(output_size);
-    int threads = kNumCUDAThreads;
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    auto cplace = platform::CPUPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-    auto& dev_ctx = ctx.cuda_device_context();
-    auto gplace = ctx.GetPlace();
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-      int rois_batch_size = rois_num_t->numel();
-
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of input(ROIs) and input(X) must be the same but "
-              "received batch size of input(ROIs) and input(X) is %d and %d "
-              "respectively.",
-              rois_batch_size, batch_size));
-      std::vector<int> rois_num_list(rois_batch_size);
-      memory::Copy(cplace, rois_num_list.data(), gplace,
-                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_list[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_list[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument(
-              "The batch size of input(ROIs) and input(X) must be the same but "
-              "received batch size of input(ROIs) and input(X) is %d and %d "
-              "respectively.",
-              rois_batch_size, batch_size));
-
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                        platform::errors::InvalidArgument(
-                            "The number of rois from input(ROIs) and its LOD "
-                            "must be the same. Received rois %d of input(ROIs) "
-                            "but the number of rois %d from its LOD is %d",
-                            rois_num, rois_num_with_lod));
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
-    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                 dev_ctx.stream());
-
-    GPUROIPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width, roi_id_data,
-        out->mutable_data<T>(ctx.GetPlace()),
-        argmax->mutable_data<int64_t>(ctx.GetPlace()));
-  }
-};
-
-template <typename Place, typename T>
-class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* rois_lod = ctx.Input<Tensor>("RoisNum");
-    auto* argmax = ctx.Input<Tensor>("Argmax");
-
-    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    int rois_num = rois->dims()[0];
-    int channels = in->dims()[1];
-    int height = in->dims()[2];
-    int width = in->dims()[3];
-
-    if (x_grad) {
-      framework::Tensor roi_batch_id_list;
-      roi_batch_id_list.Resize({rois_num});
-      auto cplace = platform::CPUPlace();
-      int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
-
-      auto& dev_ctx = ctx.cuda_device_context();
-      auto gplace = ctx.GetPlace();
-      if (ctx.HasInput("RoisNum")) {
-        auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
-        int rois_batch_size = rois_num_t->numel();
-        std::vector<int> rois_num_list(rois_batch_size);
-        memory::Copy(cplace, rois_num_list.data(), gplace,
-                     rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
-        int start = 0;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (int i = start; i < start + rois_num_list[n]; ++i) {
-            roi_batch_id_data[i] = n;
-          }
-          start += rois_num_list[n];
-        }
-      } else {
-        auto rois_lod = rois->lod().back();
-        int rois_batch_size = rois_lod.size() - 1;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-            roi_batch_id_data[i] = n;
-          }
-        }
-      }
-      int bytes = roi_batch_id_list.numel() * sizeof(int);
-      auto roi_ptr = memory::Alloc(dev_ctx, bytes);
-      int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
-      memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
-                   dev_ctx.stream());
-
-      x_grad->mutable_data<T>(ctx.GetPlace());
-      phi::funcs::SetConstant<Place, T> set_zero;
-      set_zero(dev_ctx, x_grad, static_cast<T>(0));
-
-      int output_grad_size = out_grad->numel();
-      int blocks = NumBlocks(output_grad_size);
-      int threads = kNumCUDAThreads;
-
-      if (output_grad_size > 0) {
-        GPUROIPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-            output_grad_size, rois->data<T>(), out_grad->data<T>(),
-            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
-            width, pooled_height, pooled_width, roi_id_data,
-            x_grad->mutable_data<T>(ctx.GetPlace()));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roi_pool,
-    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    roi_pool_grad,
-    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
deleted file mode 100644
index a104fd49eb3e0..0000000000000
--- a/paddle/fluid/operators/roi_pool_op.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <limits>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-static constexpr int kROISize = 4;
-
-template <typename DeviceContext, typename T>
-class CPUROIPoolOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
-    auto in_dims = in->dims();
-    int batch_size = in_dims[0];
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-
-    auto in_stride = phi::stride(in_dims);
-    auto argmax_stride = phi::stride(argmax->dims());
-    auto roi_stride = phi::stride(rois->dims());
-    auto out_stride = phi::stride(out->dims());
-
-    const T* input_data = in->data<T>();
-
-    framework::Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-    int rois_batch_size;
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument("The rois_batch_size and imgs "
-                                            "batch_size must be the same."));
-      auto* rois_num_data = rois_num_t->data<int>();
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_data[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_data[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      rois_batch_size = rois_lod.size() - 1;
-      PADDLE_ENFORCE_EQ(
-          rois_batch_size, batch_size,
-          platform::errors::InvalidArgument("The rois_batch_size and imgs "
-                                            "batch_size must be the same."));
-      int rois_num_with_lod = rois_lod[rois_batch_size];
-      PADDLE_ENFORCE_EQ(
-          rois_num, rois_num_with_lod,
-          platform::errors::InvalidArgument("The rois_num from input "
-                                            "and lod must be the same."));
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
-
-    const T* rois_data = rois->data<T>();
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = roi_batch_id_data[n];
-      int roi_start_w = round(rois_data[0] * spatial_scale);
-      int roi_start_h = round(rois_data[1] * spatial_scale);
-      int roi_end_w = round(rois_data[2] * spatial_scale);
-      int roi_end_h = round(rois_data[3] * spatial_scale);
-
-      // Force malformed ROIs to be 1x1
-      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
-      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
-
-      const float bin_size_h =
-          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
-      const float bin_size_w =
-          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
-
-      const T* batch_data = input_data + roi_batch_id * in_stride[0];
-
-      for (int c = 0; c < channels; ++c) {
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            //  Compute pooling region for this output unit:
-            //  start (included) = floor(ph * roi_height / pooled_height_)
-            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
-            int hstart =
-                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
-            int wstart =
-                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
-            int hend =
-                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
-            int wend =
-                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
-
-            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
-            hend = std::min(std::max(hend + roi_start_h, 0), height);
-            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
-            wend = std::min(std::max(wend + roi_start_w, 0), width);
-
-            const int pool_index = ph * pooled_width + pw;
-
-            // Define an empty pooling region to be zero
-            bool is_empty = (hend <= hstart) || (wend <= wstart);
-            output_data[pool_index] =
-                is_empty ? 0 : -std::numeric_limits<T>::max();
-            argmax_data[pool_index] = -1;
-
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                const int index = h * width + w;
-                if (batch_data[index] > output_data[pool_index]) {
-                  output_data[pool_index] = batch_data[index];
-                  argmax_data[pool_index] = index;
-                }
-              }
-            }
-          }
-        }
-
-        batch_data += in_stride[1];
-        output_data += out_stride[1];
-        argmax_data += argmax_stride[1];
-      }
-      // Increment ROI data pointer
-      rois_data += roi_stride[0];
-    }
-    return;
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-
-    if (in_grad) {
-      int rois_num = rois->dims()[0];
-      framework::Tensor roi_batch_id_list;
-      roi_batch_id_list.Resize({rois_num});
-      int* roi_batch_id_data =
-          roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-
-      int rois_batch_size;
-      if (ctx.HasInput("RoisNum")) {
-        auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-        rois_batch_size = rois_num_t->numel();
-        auto* rois_num_data = rois_num_t->data<int>();
-        int start = 0;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (int i = start; i < start + rois_num_data[n]; ++i) {
-            roi_batch_id_data[i] = n;
-          }
-          start += rois_num_data[n];
-        }
-      } else {
-        auto rois_lod = rois->lod().back();
-        rois_batch_size = rois_lod.size() - 1;
-        for (int n = 0; n < rois_batch_size; ++n) {
-          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-            roi_batch_id_data[i] = n;
-          }
-        }
-      }
-
-      const T* rois_data = rois->data<T>();
-      const T* out_grad_data = out_grad->data<T>();
-      const int64_t* argmax_data = argmax->data<int64_t>();
-      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-      phi::funcs::SetConstant<DeviceContext, T> set_zero;
-      set_zero(ctx.template device_context<DeviceContext>(), in_grad,
-               static_cast<T>(0));
-
-      auto in_stride = phi::stride(in->dims());
-      auto argmax_stride = phi::stride(argmax->dims());
-      auto roi_stride = phi::stride(rois->dims());
-      auto out_stride = phi::stride(out_grad->dims());
-
-      int channels = in->dims()[1];
-
-      for (int n = 0; n < rois_num; ++n) {
-        int roi_batch_idx = roi_batch_id_data[n];
-        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
-        for (int c = 0; c < channels; ++c) {
-          for (int ph = 0; ph < pooled_height; ++ph) {
-            for (int pw = 0; pw < pooled_width; ++pw) {
-              int pool_index = ph * pooled_width + pw;
-              if (argmax_data[pool_index] >= 0) {
-                auto index = argmax_data[pool_index];
-                batch_grad_data[index] += out_grad_data[pool_index];
-              }
-            }
-          }
-          batch_grad_data += in_stride[1];
-          out_grad_data += out_stride[1];
-          argmax_data += argmax_stride[1];
-        }
-        rois_data += roi_stride[0];
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc
index f82510556fde8..898db4c22fed9 100644
--- a/paddle/fluid/operators/roll_op.cc
+++ b/paddle/fluid/operators/roll_op.cc
@@ -12,13 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/roll_op.h"
-
 #include <memory>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(X) of RollOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output(Out) of RollOp should not be null."));
-
-    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
-    auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
-
-    if (!ctx->HasInput("ShiftsTensor")) {
-      if (dims.size() != 0) {
-        PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
-                          platform::errors::InvalidArgument(
-                              "When dims.size() != 0, dims.size() "
-                              "should be equal to "
-                              "shifts.size(). But received "
-                              "dims.size() = %d, shifts.size() = %d",
-                              dims.size(), shifts.size()));
-      } else {
-        PADDLE_ENFORCE_EQ(shifts.size(), 1,
-                          platform::errors::InvalidArgument(
-                              "When dims.size() == 0, shifts.size() "
-                              "should be equal to 1, But received "
-                              "shifts.size() = %d",
-                              shifts.size()));
-      }
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    auto type = ctx->GetInputsVarType("X")[0];
-    if (type == framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor,
+                            PD_INFER_META(phi::RollInferMeta));
+
 REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker,
                   ops::RollGradMaker<paddle::framework::OpDesc>,
-                  ops::RollGradMaker<paddle::imperative::OpBase>);
+                  ops::RollGradMaker<paddle::imperative::OpBase>,
+                  RollInferShapeFunctor);
 REGISTER_OPERATOR(roll_grad, ops::RollGradOp,
                   ops::RollGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    roll, ops::RollKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::RollKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::RollKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::RollKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::RollKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::RollKernel<paddle::platform::CPUDeviceContext,
-                    paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    roll_grad, ops::RollGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::RollGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::RollGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::RollGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::RollGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::RollGradKernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<double>>);
 
 REGISTER_OP_VERSION(roll)
     .AddCheckpoint(
diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu
deleted file mode 100644
index b9064c5450f9f..0000000000000
--- a/paddle/fluid/operators/roll_op.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/roll_op.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/phi/core/utils/array.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T, size_t Rank>
-__global__ void RollCudaKernel(const T* input, T* output, int64_t N,
-                               phi::Array<int64_t, Rank> shifts,
-                               phi::Array<int64_t, Rank> strides,
-                               phi::Array<int64_t, Rank> sizes) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-
-  int64_t output_idx = idx;
-  int64_t new_dim_idx = 0;
-
-#pragma unroll
-  for (size_t i = 0; i < Rank; i++) {
-    new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
-    if (new_dim_idx >= sizes[i]) {
-      output_idx += (shifts[i] - sizes[i]) * strides[i];
-    } else {
-      output_idx += shifts[i] * strides[i];
-    }
-  }
-  output[output_idx] = input[idx];
-}
-
-template <typename T>
-class RollKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    if (context.HasInput("ShiftsTensor")) {
-      const auto* shifts_tensor =
-          context.Input<framework::Tensor>("ShiftsTensor");
-      PADDLE_ENFORCE_EQ(
-          shifts_tensor->dims().size(), 1,
-          platform::errors::InvalidArgument(
-              "The rank of ShiftsTensor is expected to be 1, got %s",
-              shifts_tensor->dims().size()));
-      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
-    }
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
-
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-
-    size_t nums = shifts.size();
-    auto input_dim = in->dims();
-    auto stride_dim = phi::stride(input_dim);
-
-    std::vector<int64_t> strides(nums), sizes(nums);
-    if (dims.size() == 0) {
-      strides[0] = 1;
-      sizes[0] = numel;
-      shifts[0] = (shifts[0] % numel + numel) % numel;
-    } else {
-      for (size_t i = 0; i < nums; i++) {
-        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-        int64_t size = input_dim[dim];
-
-        if (size != 0) {
-          shifts[i] = (shifts[i] % size + size) % size;
-          strides[i] = stride_dim[dim];
-          sizes[i] = size;
-        }
-      }
-    }
-
-#define CALL_ROLL_CUDA_KERNEL(N)                                               \
-  case N: {                                                                    \
-    phi::Array<int64_t, N> _strides;                                           \
-    phi::Array<int64_t, N> _shifts;                                            \
-    phi::Array<int64_t, N> _sizes;                                             \
-    for (size_t idx = 0; idx < N; ++idx) {                                     \
-      _strides[idx] = strides[idx];                                            \
-      _shifts[idx] = shifts[idx];                                              \
-      _sizes[idx] = sizes[idx];                                                \
-    }                                                                          \
-    RollCudaKernel<                                                            \
-        T,                                                                     \
-        N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,  \
-             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel,   \
-                                                   _shifts, _strides, _sizes); \
-    break;                                                                     \
-  }
-
-    switch (nums) {
-      CALL_ROLL_CUDA_KERNEL(1);
-      CALL_ROLL_CUDA_KERNEL(2);
-      CALL_ROLL_CUDA_KERNEL(3);
-      CALL_ROLL_CUDA_KERNEL(4);
-      CALL_ROLL_CUDA_KERNEL(5);
-      CALL_ROLL_CUDA_KERNEL(6);
-      CALL_ROLL_CUDA_KERNEL(7);
-      CALL_ROLL_CUDA_KERNEL(8);
-      CALL_ROLL_CUDA_KERNEL(9);
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "shifts.size() should be less than 10, But received shifts.size() "
-            "= %d",
-            shifts.size()));
-    }
-  }
-};
-
-template <typename T>
-class RollGradKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>(framework::GradVarName("Out"));
-    auto* out = context.Output<LoDTensor>(framework::GradVarName("X"));
-    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    if (context.HasInput("ShiftsTensor")) {
-      const auto* shifts_tensor =
-          context.Input<framework::Tensor>("ShiftsTensor");
-      PADDLE_ENFORCE_EQ(
-          shifts_tensor->dims().size(), 1,
-          platform::errors::InvalidArgument(
-              "The rank of ShiftsTensor is expected to be 1, got %s",
-              shifts_tensor->dims().size()));
-      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
-    }
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
-
-    auto* in_data = in->data<T>();
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-    int64_t numel = in->numel();
-    auto stream =
-        context.template device_context<platform::CUDADeviceContext>().stream();
-    size_t nums = shifts.size();
-    auto input_dim = in->dims();
-    auto stride_dim = phi::stride(input_dim);
-
-    std::vector<int64_t> strides(nums), sizes(nums);
-    if (dims.size() == 0) {
-      strides[0] = 1;
-      sizes[0] = numel;
-      shifts[0] = ((-shifts[0]) % numel + numel) % numel;
-    } else {
-      for (size_t i = 0; i < nums; i++) {
-        int dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size();
-        int64_t size = input_dim[dim];
-        if (size != 0) {
-          shifts[i] = ((-shifts[i]) % size + size) % size;
-          strides[i] = stride_dim[dim];
-          sizes[i] = size;
-        }
-      }
-    }
-
-    switch (nums) {
-      CALL_ROLL_CUDA_KERNEL(1);
-      CALL_ROLL_CUDA_KERNEL(2);
-      CALL_ROLL_CUDA_KERNEL(3);
-      CALL_ROLL_CUDA_KERNEL(4);
-      CALL_ROLL_CUDA_KERNEL(5);
-      CALL_ROLL_CUDA_KERNEL(6);
-      CALL_ROLL_CUDA_KERNEL(7);
-      CALL_ROLL_CUDA_KERNEL(8);
-      CALL_ROLL_CUDA_KERNEL(9);
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "shifts.size() should be less than 10, But received shifts.size() "
-            "= %d",
-            shifts.size()));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    roll, ops::RollKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::RollKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<float>>,
-    ops::RollKernel<paddle::platform::CUDADeviceContext,
-                    paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    roll_grad, ops::RollGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::RollGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::RollGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::RollGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::RollGradKernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h
deleted file mode 100644
index 413c7bcfc15eb..0000000000000
--- a/paddle/fluid/operators/roll_op.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/utils.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-
-template <typename T>
-inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim,
-                            int64_t shift) {
-  if (dim < 0) {
-    dim += input_dim.size();
-  }
-  if (input_dim[dim] == 0) {
-    return;
-  }
-  shift = shift % input_dim[dim];
-  if (shift < 0) {
-    shift += input_dim[dim];
-  }
-
-  auto outer_loops = 1;
-  for (auto i = 0; i < dim; i++) {
-    outer_loops *= input_dim[i];
-  }
-  auto slice_width = 1;
-  for (auto i = dim + 1; i < input_dim.size(); i++) {
-    slice_width *= input_dim[i];
-  }
-
-  VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim
-          << "; dim: " << dim << "; shift: " << shift
-          << "; outer_loops: " << outer_loops
-          << "; slice_width: " << slice_width;
-  if (shift == 0) {
-    return;
-  }
-
-  std::vector<T> head;
-  auto head_size = slice_width * (input_dim[dim] - shift);
-  head.resize(head_size);
-
-  for (auto i = 0; i < outer_loops; i++) {
-    for (auto j = 0; j < head_size; j++) {
-      head[j] = data[i * input_dim[dim] * slice_width + j];
-    }
-    for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) {
-      auto dst_pos = j - input_dim[dim] + shift;
-      for (auto k = 0; k < slice_width; k++) {
-        data[(i * input_dim[dim] + dst_pos) * slice_width + k] =
-            data[(i * input_dim[dim] + j) * slice_width + k];
-      }
-    }
-    for (auto j = 0; j < head_size; j++) {
-      data[(i * input_dim[dim] + shift) * slice_width + j] = head[j];
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class RollKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_var = context.InputVar("X");
-    auto* output_var = context.OutputVar("Out");
-    auto& input = input_var->Get<LoDTensor>();
-    auto* output = output_var->GetMutable<LoDTensor>();
-    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    if (context.HasInput("ShiftsTensor")) {
-      const auto* shifts_tensor =
-          context.Input<framework::Tensor>("ShiftsTensor");
-      PADDLE_ENFORCE_EQ(
-          shifts_tensor->dims().size(), 1,
-          platform::errors::InvalidArgument(
-              "The rank of ShiftsTensor is expected to be 1, got %s",
-              shifts_tensor->dims().size()));
-      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
-    }
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
-
-    std::vector<T> out_vec;
-    paddle::framework::TensorToVector(input, context.device_context(),
-                                      &out_vec);
-
-    size_t nums = shifts.size();
-    DDim input_dim = input.dims();
-
-    // axis = none, reshape to 1-D tensor
-    if (dims.size() == 0) {
-      dims.push_back(0l);
-      input_dim = framework::Dim<1>(out_vec.size());
-    }
-
-    for (size_t i = 0; i < nums; i++) {
-      PADDLE_ENFORCE_EQ(
-          dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true,
-          platform::errors::OutOfRange(
-              "Attr(axis[%d]) is out of range, It's expected "
-              "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
-              i, input_dim.size(), input_dim.size() - 1, i, dims[i]));
-      shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]);
-    }
-    output->mutable_data<T>(context.GetPlace());
-    framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input.dims());
-  }
-};
-
-template <typename DeviceContext, typename T>
-class RollGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_var = context.InputVar(framework::GradVarName("Out"));
-    auto* output_var = context.OutputVar(framework::GradVarName("X"));
-    auto& input = input_var->Get<LoDTensor>();
-    auto* output = output_var->GetMutable<LoDTensor>();
-    std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    if (context.HasInput("ShiftsTensor")) {
-      const auto* shifts_tensor =
-          context.Input<framework::Tensor>("ShiftsTensor");
-      shifts = GetDataFromTensor<int64_t>(shifts_tensor);
-    }
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
-
-    std::vector<T> out_vec;
-    paddle::framework::TensorToVector(input, context.device_context(),
-                                      &out_vec);
-
-    size_t nums = shifts.size();
-    DDim input_dim = input.dims();
-
-    // axis = none, reshape to 1-D tensor
-    if (dims.size() == 0) {
-      dims.push_back(0l);
-      input_dim = framework::Dim<1>(out_vec.size());
-    }
-
-    for (size_t i = 0; i < nums; i++) {
-      shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]);
-    }
-    output->mutable_data<T>(context.GetPlace());
-    framework::TensorFromVector(out_vec, context.device_context(), output);
-    output->Resize(input.dims());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc
index d0290795455db..3a6fdbaa2613d 100644
--- a/paddle/fluid/operators/searchsorted_op.cc
+++ b/paddle/fluid/operators/searchsorted_op.cc
@@ -12,8 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace paddle {
 namespace operators {
@@ -21,60 +23,6 @@ namespace operators {
 class SearchSortedOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  static bool SearchsortedDimsMatchedBeforeLastDim(
-      const framework::DDim& sequences_dims,
-      const framework::DDim& values_dims) {
-    if (sequences_dims.size() != values_dims.size()) {
-      return false;
-    }
-    const auto& sequences_dims_size = sequences_dims.size();
-    for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) {
-      if (sequences_dims[dim] != values_dims[dim]) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence",
-                   "searchsorted");
-    OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted");
-
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted");
-
-    auto sequences_dims = ctx->GetInputDim("SortedSequence");
-    auto values_dims = ctx->GetInputDim("Values");
-    auto out_int32 = ctx->Attrs().Get<bool>("out_int32");
-
-    if (sequences_dims.size() != 1) {
-      PADDLE_ENFORCE_EQ(
-          SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims),
-          true,
-          platform::errors::Unavailable(
-              "The dimensions of sorted_sequence tensor ( %s ) and values "
-              "tensor ( %s ) can not match. Because the input sorted_sequence "
-              "tensor must be 1 dimension or the first N-1 dimensions of "
-              "sorted_sequence tensor and input values tensor must match. "
-              "Please input appropriate sorted_sequence and values again! ",
-              sequences_dims, values_dims));
-    }
-
-    if (out_int32) {
-      PADDLE_ENFORCE_LT(
-          sequences_dims[sequences_dims.size() - 1],
-          std::numeric_limits<int>::max(),
-          platform::errors::Unavailable(
-              "The size of sorted_sequence %d exceed the maximum limit d%. "
-              "Because the size of sorted_sequence should be less than the "
-              "output maximum value for int32 bit. Please set appropriate "
-              "sorted_sequence to meet this requirement! ",
-              sequences_dims[sequences_dims.size() - 1],
-              std::numeric_limits<int>::max()));
-    }
-
-    ctx->SetOutputDim("Out", values_dims);
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -115,4 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker);
+DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor,
+                            PD_INFER_META(phi::SearchsortedInferMeta));
+REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker,
+                  SearchsortedInferShapeFunctor);
diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc
index 513ab46e9b5ee..73655bcb18500 100644
--- a/paddle/fluid/operators/set_value_op.cc
+++ b/paddle/fluid/operators/set_value_op.cc
@@ -13,9 +13,15 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/set_value_op.h"
+
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -34,6 +40,8 @@ class CPUDeviceContext;
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 class SetValue : public framework::OperatorWithKernel {
  public:
   SetValue(const std::string &type, const framework::VariableNameMap &inputs,
@@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel {
            const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_LT(
-        in_dims.size(), 7,
-        platform::errors::InvalidArgument(
-            "The rank of input should be less than 7, but received %d.",
-            in_dims.size()));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -236,10 +233,13 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"});
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor,
+                            PD_INFER_META(phi::SetValueInferMeta));
+
 REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker,
                   ops::SetValueGradMaker<paddle::framework::OpDesc>,
                   ops::SetValueGradMaker<paddle::imperative::OpBase>,
-                  ops::SetValueOpInplaceInferer);
+                  ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor);
 
 REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad);
 
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index e2c8359beb129..9001ce5d51dec 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -22,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
-                      platform::errors::InvalidArgument(
-                          "Input (Input) of get_shape op should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::InvalidArgument(
-                          "Output (Out) of get_shape op should not be null."));
-    auto in_dim = ctx->GetInputDim("Input");
-    ctx->SetOutputDim("Out", {in_dim.size()});
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
@@ -89,7 +81,12 @@ Return the shape of the input.
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
+DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor,
+                            PD_INFER_META(phi::ShapeInferMeta));
+
 REGISTER_OPERATOR(
     shape, ops::ShapeOp, ops::ShapeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ShapeInferShapeFunctor);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 19a395e72314d..41545a1ca20b2 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel(
 */
 template <typename T, typename LabelT>
 __global__ void SoftmaxWithCrossEntropyGradHardLabel(
-    T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n,
-    const int64_t dim, const int64_t d, const int ignore_index) {
+    T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels,
+    const int64_t n, const int64_t dim, const int64_t d,
+    const int ignore_index) {
   int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   int64_t idx_n = idx / (d * dim);
   int64_t idx_dim = (idx / d) % dim;
@@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel(
     if (lbl == ignore_index) {
       logits_grad[idx] = static_cast<T>(0.0);
     } else if (lbl == idx_dim) {
-      logits_grad[idx] =
-          (logits_grad[idx] - static_cast<T>(1.0)) * loss_grad[ids];
+      logits_grad[idx] = (softmax[idx] - static_cast<T>(1.0)) * loss_grad[ids];
     } else {
-      logits_grad[idx] *= loss_grad[ids];
+      logits_grad[idx] = softmax[idx] * loss_grad[ids];
     }
   }
 }
@@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
     const Tensor* softmax = context.Input<Tensor>("Softmax");
-    if (logit_grad != softmax) {
+    auto stream = context.cuda_device_context().stream();
+    auto ignore_index = context.Attr<int>("ignore_index");
+    auto use_softmax = context.Attr<bool>("use_softmax");
+
+    T* logit_grad_data = nullptr;
+    bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label));
+    if (copy_flag) {
       framework::TensorCopy(*softmax, context.GetPlace(),
                             context.device_context(), logit_grad);
+      logit_grad_data = logit_grad->template data<T>();
+    } else {
+      logit_grad_data =
+          logit_grad->template mutable_data<T>(context.GetPlace());
     }
-    T* logit_grad_data = logit_grad->template data<T>();
 
     const int rank = logit_grad->dims().size();
     const int axis = phi::funcs::CanonicalAxis(context.Attr<int>("axis"), rank);
@@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 #else
     int block = 512;
 #endif
-    auto stream = context.cuda_device_context().stream();
-    auto ignore_index = context.Attr<int>("ignore_index");
-    auto use_softmax = context.Attr<bool>("use_softmax");
 
     // do not with softmax op, and input is softmax
     if (!use_softmax) {
@@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
       SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
           logit_grad_data, loss_grad_data, label_data, n, d, remain);
     } else {
+      const T* softmax_data = softmax->template data<T>();
       const auto* label_data = labels.template data<LabelT>();
       int grid = (n * d + block - 1) / block;
       SoftmaxWithCrossEntropyGradHardLabel<T><<<grid, block, 0, stream>>>(
-          logit_grad_data, loss_grad_data, label_data, n, d / remain, remain,
-          ignore_index);
+          logit_grad_data, loss_grad_data, softmax_data, label_data, n,
+          d / remain, remain, ignore_index);
     }
   }
 };
diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h
index 39639768241d4..c9889ad539d08 100644
--- a/paddle/fluid/operators/spectral_helper.h
+++ b/paddle/fluid/operators/spectral_helper.h
@@ -16,451 +16,469 @@
 
 #include "paddle/fluid/operators/spectral_op.h"
 
-#ifdef PADDLE_WITH_HIP
-#include "paddle/fluid/platform/dynload/hipfft.h"
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/dynload/cufft.h"
+#if defined(PADDLE_WITH_ONEMKL)
+#include "paddle/phi/backends/dynload/mklrt.h"
+#elif defined(PADDLE_WITH_POCKETFFT)
+#include "extern_pocketfft/pocketfft_hdronly.h"
 #endif
 
 namespace paddle {
 namespace operators {
-using ScalarType = framework::proto::VarType::Type;
-const int64_t kMaxFFTNdim = 3;
-const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
-// This struct is used to easily compute hashes of the
-// parameters. It will be the **key** to the plan cache.
-struct FFTConfigKey {
-  // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
-  int64_t signal_ndim_;
-  // These include additional batch dimension as well.
-  int64_t sizes_[kMaxDataNdim];
-  int64_t input_shape_[kMaxDataNdim];
-  int64_t output_shape_[kMaxDataNdim];
-  FFTTransformType fft_type_;
-  ScalarType value_type_;
-
-  FFTConfigKey() = default;
-
-  FFTConfigKey(const std::vector<int64_t>& in_shape,
-               const std::vector<int64_t>& out_shape,
-               const std::vector<int64_t>& signal_size,
-               FFTTransformType fft_type, ScalarType value_type) {
-    // Padding bits must be zeroed for hashing
-    memset(this, 0, sizeof(*this));
-    signal_ndim_ = signal_size.size() - 1;
-    fft_type_ = fft_type;
-    value_type_ = value_type;
-
-    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
-    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
-    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
-  }
-};
-
-#if defined(PADDLE_WITH_CUDA)
-// An RAII encapsulation of cuFFTHandle
-class CuFFTHandle {
-  ::cufftHandle handle_;
 
- public:
-  CuFFTHandle() {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
-  }
-
-  CuFFTHandle(const CuFFTHandle& other) = delete;
-  CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
+using Tensor = framework::Tensor;
 
-  CuFFTHandle(CuFFTHandle&& other) = delete;
-  CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
+// FFT Functors
+#if defined(PADDLE_WITH_ONEMKL)
 
-  ::cufftHandle& get() { return handle_; }
-  const ::cufftHandle& get() const { return handle_; }
+#define MKL_DFTI_CHECK(expr)                                                   \
+  do {                                                                         \
+    MKL_LONG status = (expr);                                                  \
+    if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR))                  \
+      PADDLE_THROW(                                                            \
+          platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
+  } while (0);
 
-  ~CuFFTHandle() {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_));
+struct DftiDescriptorDeleter {
+  void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
+    if (handle != nullptr) {
+      MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
+    }
   }
 };
 
-using plan_size_type = long long int;  // NOLINT
-// This class contains all the information needed to execute a cuFFT plan:
-//   1. the plan
-//   2. the workspace size needed
-class FFTConfig {
+// A RAII wrapper for MKL_DESCRIPTOR*
+class DftiDescriptor {
  public:
-  // Only move semantics is enought for this class. Although we already use
-  // unique_ptr for the plan, still remove copy constructor and assignment op so
-  // we don't accidentally copy and take perf hit.
-  explicit FFTConfig(const FFTConfigKey& plan_key)
-      : FFTConfig(
-            std::vector<int64_t>(plan_key.sizes_,
-                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
-            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
-
-  // sizes are full signal, including batch size and always two-sided
-  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
-            FFTTransformType fft_type, ScalarType dtype)
-      : fft_type_(fft_type), value_type_(dtype) {
-    // signal sizes (excluding batch dim)
-    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
-
-    // input batch size
-    const auto batch = static_cast<plan_size_type>(sizes[0]);
-    // const int64_t signal_ndim = sizes.size() - 1;
-    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
-                      platform::errors::InvalidArgument(
-                          "The signal_ndim must be equal to sizes.size() - 1,"
-                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
-                          signal_ndim, sizes.size() - 1));
-
-    cudaDataType itype, otype, exec_type;
-    const auto complex_input = has_complex_input(fft_type);
-    const auto complex_output = has_complex_output(fft_type);
-    if (dtype == framework::proto::VarType::FP32) {
-      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
-      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
-      exec_type = CUDA_C_32F;
-    } else if (dtype == framework::proto::VarType::FP64) {
-      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
-      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
-      exec_type = CUDA_C_64F;
-    } else if (dtype == framework::proto::VarType::FP16) {
-      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
-      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
-      exec_type = CUDA_C_16F;
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "cuFFT only support transforms of type float16, float32 and "
-          "float64"));
-    }
-
-    // disable auto allocation of workspace to use allocator from the framework
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
-        plan(), /* autoAllocate */ 0));
-
-    size_t ws_size_t;
-
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
-        plan(), signal_ndim, signal_sizes.data(),
-        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
-        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
-        batch, &ws_size_t, exec_type));
-
-    ws_size = ws_size_t;
+  void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
+            MKL_LONG signal_ndim, MKL_LONG* sizes) {
+    PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
+                      platform::errors::AlreadyExists(
+                          "DftiDescriptor has already been initialized."));
+
+    DFTI_DESCRIPTOR* raw_desc;
+    MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
+        &raw_desc, precision, signal_type, signal_ndim, sizes));
+    desc_.reset(raw_desc);
   }
 
-  FFTConfig(const FFTConfig& other) = delete;
-  FFTConfig& operator=(const FFTConfig& other) = delete;
-
-  FFTConfig(FFTConfig&& other) = delete;
-  FFTConfig& operator=(FFTConfig&& other) = delete;
-
-  const cufftHandle& plan() const { return plan_ptr.get(); }
-
-  FFTTransformType transform_type() const { return fft_type_; }
-  ScalarType data_type() const { return value_type_; }
-  size_t workspace_size() const { return ws_size; }
+  DFTI_DESCRIPTOR* get() const {
+    DFTI_DESCRIPTOR* raw_desc = desc_.get();
+    PADDLE_ENFORCE_NOT_NULL(raw_desc,
+                            platform::errors::PreconditionNotMet(
+                                "DFTI DESCRIPTOR has not been initialized."));
+    return raw_desc;
+  }
 
  private:
-  CuFFTHandle plan_ptr;
-  size_t ws_size;
-  FFTTransformType fft_type_;
-  ScalarType value_type_;
+  std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
 };
 
-#elif defined(PADDLE_WITH_HIP)
-// An RAII encapsulation of cuFFTHandle
-class HIPFFTHandle {
-  ::hipfftHandle handle_;
-
- public:
-  HIPFFTHandle() {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_));
+static DftiDescriptor _plan_mkl_fft(
+    const framework::proto::VarType::Type& in_dtype,
+    const framework::proto::VarType::Type& out_dtype,
+    const framework::DDim& in_strides, const framework::DDim& out_strides,
+    const std::vector<int>& signal_sizes, FFTNormMode normalization,
+    bool forward) {
+  const DFTI_CONFIG_VALUE precision = [&] {
+    switch (in_dtype) {
+      case framework::proto::VarType::FP32:
+        return DFTI_SINGLE;
+      case framework::proto::VarType::COMPLEX64:
+        return DFTI_SINGLE;
+      case framework::proto::VarType::FP64:
+        return DFTI_DOUBLE;
+      case framework::proto::VarType::COMPLEX128:
+        return DFTI_DOUBLE;
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid input datatype (%s), input data type should be FP32, "
+            "FP64, COMPLEX64 or COMPLEX128.",
+            framework::DataTypeToString(in_dtype)));
+    }
+  }();
+
+  // C2C, R2C, C2R
+  const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
+  const DFTI_CONFIG_VALUE domain =
+      (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
+
+  DftiDescriptor descriptor;
+  std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
+  const MKL_LONG signal_ndim = fft_sizes.size() - 1;
+  descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
+
+  // placement inplace or not inplace
+  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
+                                            DFTI_NOT_INPLACE));
+
+  // number of transformations
+  const MKL_LONG batch_size = fft_sizes[0];
+  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
+      descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
+
+  // input & output distance
+  const MKL_LONG idist = in_strides[0];
+  const MKL_LONG odist = out_strides[0];
+  MKL_DFTI_CHECK(
+      phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
+  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
+                                            DFTI_OUTPUT_DISTANCE, odist));
+
+  // input & output stride
+  std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
+  std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
+  for (MKL_LONG i = 1; i <= signal_ndim; i++) {
+    mkl_in_stride[i] = in_strides[i];
+    mkl_out_stride[i] = out_strides[i];
   }
-
-  HIPFFTHandle(const HIPFFTHandle& other) = delete;
-  HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
-
-  HIPFFTHandle(HIPFFTHandle&& other) = delete;
-  HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
-
-  ::hipfftHandle& get() { return handle_; }
-  const ::hipfftHandle& get() const { return handle_; }
-
-  ~HIPFFTHandle() {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
+  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
+      descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
+  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
+      descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
+
+  // conjugate even storage
+  if (!(fft_type == FFTTransformType::C2C)) {
+    MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
+        descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
   }
-};
-using plan_size_type = int;
-// This class contains all the information needed to execute a cuFFT plan:
-//   1. the plan
-//   2. the workspace size needed
-class FFTConfig {
- public:
-  // Only move semantics is enought for this class. Although we already use
-  // unique_ptr for the plan, still remove copy constructor and assignment op so
-  // we don't accidentally copy and take perf hit.
-  explicit FFTConfig(const FFTConfigKey& plan_key)
-      : FFTConfig(
-            std::vector<int64_t>(plan_key.sizes_,
-                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
-            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
-
-  // sizes are full signal, including batch size and always two-sided
-  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
-            FFTTransformType fft_type, ScalarType dtype)
-      : fft_type_(fft_type), value_type_(dtype) {
-    // signal sizes (excluding batch dim)
-    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
-
-    // input batch size
-    const auto batch = static_cast<plan_size_type>(sizes[0]);
-    // const int64_t signal_ndim = sizes.size() - 1;
-    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
-                      platform::errors::InvalidArgument(
-                          "The signal_ndim must be equal to sizes.size() - 1,"
-                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
-                          signal_ndim, sizes.size() - 1));
-
-    hipfftType exec_type = [&] {
-      if (dtype == framework::proto::VarType::FP32) {
-        switch (fft_type) {
-          case FFTTransformType::C2C:
-            return HIPFFT_C2C;
-          case FFTTransformType::R2C:
-            return HIPFFT_R2C;
-          case FFTTransformType::C2R:
-            return HIPFFT_C2R;
-        }
-      } else if (dtype == framework::proto::VarType::FP64) {
-        switch (fft_type) {
-          case FFTTransformType::C2C:
-            return HIPFFT_Z2Z;
-          case FFTTransformType::R2C:
-            return HIPFFT_D2Z;
-          case FFTTransformType::C2R:
-            return HIPFFT_Z2D;
-        }
+
+  MKL_LONG signal_numel =
+      std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
+                      std::multiplies<MKL_LONG>());
+  if (normalization != FFTNormMode::none) {
+    const double scale =
+        ((normalization == FFTNormMode::by_sqrt_n)
+             ? 1.0 / std::sqrt(static_cast<double>(signal_numel))
+             : 1.0 / static_cast<double>(signal_numel));
+    const auto scale_direction = [&]() {
+      if (fft_type == FFTTransformType::R2C ||
+          (fft_type == FFTTransformType::C2C && forward)) {
+        return DFTI_FORWARD_SCALE;
+      } else {
+        // (fft_type == FFTTransformType::C2R ||
+        //          (fft_type == FFTTransformType::C2C && !forward))
+        return DFTI_BACKWARD_SCALE;
       }
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "hipFFT only support transforms of type float32 and float64"));
     }();
-
-    // disable auto allocation of workspace to use allocator from the framework
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
-        plan(), /* autoAllocate */ 0));
-
-    size_t ws_size_t;
-
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
-        plan(), signal_ndim, signal_sizes.data(),
-        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
-        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
-        batch, &ws_size_t));
-
-    ws_size = ws_size_t;
+    MKL_DFTI_CHECK(
+        phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
   }
 
-  const hipfftHandle& plan() const { return plan_ptr.get(); }
-
-  FFTTransformType transform_type() const { return fft_type_; }
-  ScalarType data_type() const { return value_type_; }
-  size_t workspace_size() const { return ws_size; }
+  // commit the descriptor
+  MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
+  return descriptor;
+}
 
- private:
-  HIPFFTHandle plan_ptr;
-  size_t ws_size;
-  FFTTransformType fft_type_;
-  ScalarType value_type_;
-};
-#endif
+// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
+template <typename DeviceContext, typename Ti, typename To>
+void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
+              const std::vector<int64_t>& axes, FFTNormMode normalization,
+              bool forward) {
+  const framework::DDim& in_sizes = x->dims();
+  const int ndim = in_sizes.size();
+  const int signal_ndim = axes.size();
+  const int batch_ndim = ndim - signal_ndim;
+  const framework::DDim& out_sizes = out->dims();
+
+  // make a dim permutation
+  std::vector<int> dim_permute(ndim);
+  std::iota(dim_permute.begin(), dim_permute.end(), 0);
+  std::vector<bool> is_transformed_dim(ndim, false);
+  for (const auto& d : axes) {
+    is_transformed_dim[d] = true;
+  }
+  const auto batch_end =
+      std::partition(dim_permute.begin(), dim_permute.end(),
+                     [&](size_t axis) { return !is_transformed_dim[axis]; });
+  std::copy(axes.cbegin(), axes.cend(), batch_end);
+
+  // transpose input according to that permutation
+  framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
+  std::vector<int64_t> transposed_input_shape_ =
+      phi::vectorize(transposed_input_shape);
+  framework::Tensor transposed_input;
+  transposed_input.Resize(transposed_input_shape);
+  const auto place = ctx.GetPlace();
+  transposed_input.mutable_data<Ti>(place);
+  TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
+                                               dim_permute);
+
+  // make an collapsed input: collapse batch axes for input
+  const int batch_size = std::accumulate(
+      transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
+      1L, std::multiplies<int64_t>());
+  std::vector<int> collapsed_input_shape_(1 + signal_ndim);
+  collapsed_input_shape_[0] = batch_size;
+  std::copy(transposed_input_shape_.begin() + batch_ndim,
+            transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
+  const framework::DDim collapsed_input_shape =
+      phi::make_ddim(collapsed_input_shape_);
+  transposed_input.Resize(collapsed_input_shape);
+  framework::Tensor& collapsed_input = transposed_input;
+
+  // make a collapsed output
+  std::vector<int> collapsed_output_shape_(1 + signal_ndim);
+  collapsed_output_shape_[0] = batch_size;
+  for (int i = 0; i < signal_ndim; i++) {
+    collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
+  }
+  const framework::DDim collapsed_output_shape =
+      phi::make_ddim(collapsed_output_shape_);
+  framework::Tensor collapsed_output;
+  collapsed_output.Resize(collapsed_output_shape);
+  collapsed_output.mutable_data(place, out->type());
+
+  // signal sizes
+  std::vector<int> signal_sizes(1 + signal_ndim);
+  signal_sizes[0] = batch_size;
+  for (int i = 0; i < signal_ndim; i++) {
+    signal_sizes[1 + i] =
+        std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
+  }
 
-// Hashing machinery for Key
-// Fowler–Noll–Vo hash function
-// see
-// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
-template <typename Key>
-struct KeyHash {
-  // Key must be a POD because we read out its memory
-  // contenst as char* when hashing
-  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
-
-  size_t operator()(const Key& params) const {
-    auto ptr = reinterpret_cast<const uint8_t*>(&params);
-    uint32_t value = 0x811C9DC5;
-    for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
-      value ^= ptr[i];
-      value *= 0x01000193;
+  // input & output stride
+  const framework::DDim input_stride = phi::stride(collapsed_input_shape);
+  const framework::DDim output_stride = phi::stride(collapsed_output_shape);
+
+  // make a DFTI_DESCRIPTOR
+  DftiDescriptor desc =
+      _plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
+                    framework::TransToProtoVarType(out->dtype()), input_stride,
+                    output_stride, signal_sizes, normalization, forward);
+
+  const FFTTransformType fft_type =
+      GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
+                          framework::TransToProtoVarType(out->type()));
+  if (fft_type == FFTTransformType::C2R && forward) {
+    framework::Tensor collapsed_input_conj(collapsed_input.dtype());
+    collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
+                                          ctx.GetPlace());
+    // conjugate the input
+    platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
+    phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
+                                        collapsed_input.numel(),
+                                        collapsed_input_conj.data<Ti>());
+    for_range(functor);
+    MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
+        desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
+  } else if (fft_type == FFTTransformType::R2C && !forward) {
+    framework::Tensor collapsed_output_conj(collapsed_output.dtype());
+    collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
+                                           ctx.GetPlace());
+    MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
+        desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
+    // conjugate the output
+    platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
+    phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
+                                        collapsed_output.numel(),
+                                        collapsed_output.data<To>());
+    for_range(functor);
+  } else {
+    if (forward) {
+      MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
+          desc.get(), collapsed_input.data(), collapsed_output.data()));
+    } else {
+      MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
+          desc.get(), collapsed_input.data(), collapsed_output.data()));
     }
-    return static_cast<size_t>(value);
   }
-};
 
-template <typename Key>
-struct KeyEqual {
-  // Key must be a POD because we read out its memory
-  // contenst as char* when comparing
-  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+  // resize for the collapsed output
+  framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
+  collapsed_output.Resize(transposed_output_shape);
+  framework::Tensor& transposed_output = collapsed_output;
 
-  bool operator()(const Key& a, const Key& b) const {
-    auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
-    auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
-    return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
+  // reverse the transposition
+  std::vector<int> reverse_dim_permute(ndim);
+  for (int i = 0; i < ndim; i++) {
+    reverse_dim_permute[dim_permute[i]] = i;
   }
-};
-
-#if CUDA_VERSION < 10000
-// Note that the max plan number for CUDA version < 10 has to be 1023
-// due to a bug that fails on the 1024th plan
-constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
-constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
-#else
-constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
-// The default max cache size chosen for CUDA version > 10 is arbitrary.
-// This number puts a limit on how big of a plan cache should we maintain by
-// default. Users can always configure it via cufft_set_plan_cache_max_size.
-constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
-#endif
-static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
-                  CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
-              "CUFFT_MAX_PLAN_NUM not in size_t range");
-static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
-                  CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
-              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
-
-// This cache assumes that the mapping from key to value never changes.
-// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
-// value returned from try_emplace_value.
-// The contract of using this cache is that try_emplace_value should only be
-// used when the max_size is positive.
-class FFTConfigCache {
- public:
-  using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
-  using map_t = typename std::unordered_map<
-      std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
-      KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
-  using map_kkv_iter_t = typename map_t::iterator;
-
-  FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
-
-  explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
-
-  FFTConfigCache(const FFTConfigCache& other) = delete;
-  FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
-
-  FFTConfigCache(FFTConfigCache&& other) noexcept
-      : _usage_list(std::move(other._usage_list)),
-        _cache_map(std::move(other._cache_map)),
-        _max_size(other._max_size) {}
+  TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
+                                               out, reverse_dim_permute);
+}
 
-  FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
-    _usage_list = std::move(other._usage_list);
-    _cache_map = std::move(other._cache_map);
-    _max_size = other._max_size;
-    return *this;
+template <typename Ti, typename To>
+struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
+                                                 normalization, forward);
   }
+};
 
-  // If key is in this cache, return the cached config. Otherwise, emplace the
-  // config in this cache and return it.
-  FFTConfig& lookup(FFTConfigKey params) {
-    PADDLE_ENFORCE_GT(_max_size, 0,
-                      platform::errors::InvalidArgument(
-                          "The max size of FFTConfigCache must be great than 0,"
-                          "But received is [%d]",
-                          _max_size));
-
-    map_kkv_iter_t map_it = _cache_map.find(params);
-    // Hit, put to list front
-    if (map_it != _cache_map.end()) {
-      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
-      return map_it->second->second;
-    }
+template <typename Ti, typename To>
+struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
+                                                 normalization, forward);
+  }
+};
 
-    // Miss
-    // remove if needed
-    if (_usage_list.size() >= _max_size) {
-      auto last = _usage_list.end();
-      last--;
-      _cache_map.erase(last->first);
-      _usage_list.pop_back();
+template <typename Ti, typename To>
+struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    if (axes.size() > 1) {
+      const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
+      Tensor temp;
+      temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
+
+      FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
+      c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
+
+      const std::vector<int64_t> new_axes{axes.back()};
+      exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
+                                                   normalization, forward);
+    } else {
+      exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
+                                                   normalization, forward);
     }
-
-    // construct new plan at list front, then insert into _cache_map
-    _usage_list.emplace_front(std::piecewise_construct,
-                              std::forward_as_tuple(params),
-                              std::forward_as_tuple(params));
-    auto kv_it = _usage_list.begin();
-    _cache_map.emplace(std::piecewise_construct,
-                       std::forward_as_tuple(kv_it->first),
-                       std::forward_as_tuple(kv_it));
-    return kv_it->second;
   }
-
-  void clear() {
-    _cache_map.clear();
-    _usage_list.clear();
+};
+#elif defined(PADDLE_WITH_POCKETFFT)
+
+template <typename T>
+T compute_factor(int64_t size, FFTNormMode normalization) {
+  constexpr auto one = static_cast<T>(1);
+  switch (normalization) {
+    case FFTNormMode::none:
+      return one;
+    case FFTNormMode::by_n:
+      return one / static_cast<T>(size);
+    case FFTNormMode::by_sqrt_n:
+      return one / std::sqrt(static_cast<T>(size));
   }
+  PADDLE_THROW(
+      platform::errors::InvalidArgument("Unsupported normalization type"));
+}
 
-  void resize(int64_t new_size) {
-    _set_max_size(new_size);
-    auto cur_size = _usage_list.size();
-    if (cur_size > _max_size) {
-      auto delete_it = _usage_list.end();
-      for (size_t i = 0; i < cur_size - _max_size; i++) {
-        delete_it--;
-        _cache_map.erase(delete_it->first);
-      }
-      _usage_list.erase(delete_it, _usage_list.end());
+template <typename Ti, typename To>
+struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    using R = typename Ti::value_type;
+    using C = std::complex<R>;
+
+    const auto& input_dim = x->dims();
+    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
+    std::vector<std::ptrdiff_t> in_strides =
+        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
+    const int64_t data_size = sizeof(C);
+    std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
+                   [&](std::ptrdiff_t s) { return s * data_size; });
+
+    const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
+    auto* out_data = reinterpret_cast<C*>(out->data<To>());
+    // pocketfft requires std::vector<size_t>
+    std::vector<size_t> axes_(axes.size());
+    std::copy(axes.begin(), axes.end(), axes_.begin());
+    // compuet factor
+    int64_t signal_numel = 1;
+    for (auto i : axes) {
+      signal_numel *= in_sizes[i];
     }
+    R factor = compute_factor<R>(signal_numel, normalization);
+    pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
+                   out_data, factor);
   }
+};
 
-  size_t size() const { return _cache_map.size(); }
-
-  size_t max_size() const noexcept { return _max_size; }
+template <typename Ti, typename To>
+struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    using R = Ti;
+    using C = std::complex<R>;
+
+    const auto& input_dim = x->dims();
+    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
+    std::vector<std::ptrdiff_t> in_strides =
+        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
+    {
+      const int64_t data_size = sizeof(R);
+      std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
+                     [&](std::ptrdiff_t s) { return s * data_size; });
+    }
 
-  std::mutex mutex;
+    const auto& output_dim = out->dims();
+    const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
+    std::vector<std::ptrdiff_t> out_strides =
+        phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
+    {
+      const int64_t data_size = sizeof(C);
+      std::transform(out_strides.begin(), out_strides.end(),
+                     out_strides.begin(),
+                     [&](std::ptrdiff_t s) { return s * data_size; });
+    }
 
- private:
-  // Only sets size and does value check. Does not resize the data structures.
-  void _set_max_size(int64_t new_size) {
-    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
-    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
-    // first.
-    PADDLE_ENFORCE_GE(
-        new_size, 0,
-        platform::errors::InvalidArgument(
-            "cuFFT plan cache size must be non-negative, But received is [%d]",
-            new_size));
-    PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
-                      platform::errors::InvalidArgument(
-                          "cuFFT plan cache size can not be larger than [%d], "
-                          "But received is [%d]",
-                          CUFFT_MAX_PLAN_NUM, new_size));
-    _max_size = static_cast<size_t>(new_size);
+    const auto* in_data = x->data<R>();
+    auto* out_data = reinterpret_cast<C*>(out->data<To>());
+    // pocketfft requires std::vector<size_t>
+    std::vector<size_t> axes_(axes.size());
+    std::copy(axes.begin(), axes.end(), axes_.begin());
+    // compuet normalization factor
+    int64_t signal_numel = 1;
+    for (auto i : axes) {
+      signal_numel *= in_sizes[i];
+    }
+    R factor = compute_factor<R>(signal_numel, normalization);
+    pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
+                   out_data, factor);
   }
-
-  std::list<kv_t> _usage_list;
-  map_t _cache_map;
-  size_t _max_size;
 };
 
-static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
-static std::mutex plan_caches_mutex;
-
-static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
-  std::lock_guard<std::mutex> guard(plan_caches_mutex);
+template <typename Ti, typename To>
+struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
+  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    using R = To;
+    using C = std::complex<R>;
+
+    const auto& input_dim = x->dims();
+    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
+    std::vector<std::ptrdiff_t> in_strides =
+        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
+    {
+      const int64_t data_size = sizeof(C);
+      std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
+                     [&](std::ptrdiff_t s) { return s * data_size; });
+    }
 
-  if (device_index >= plan_caches.size()) {
-    plan_caches.resize(device_index + 1);
-  }
+    const auto& output_dim = out->dims();
+    const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
+    std::vector<std::ptrdiff_t> out_strides =
+        phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
+    {
+      const int64_t data_size = sizeof(R);
+      std::transform(out_strides.begin(), out_strides.end(),
+                     out_strides.begin(),
+                     [&](std::ptrdiff_t s) { return s * data_size; });
+    }
 
-  if (!plan_caches[device_index]) {
-    plan_caches[device_index] = std::make_unique<FFTConfigCache>();
+    const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
+    auto* out_data = out->data<R>();
+    // pocketfft requires std::vector<size_t>
+    std::vector<size_t> axes_(axes.size());
+    std::copy(axes.begin(), axes.end(), axes_.begin());
+    // compuet normalization factor
+    int64_t signal_numel = 1;
+    for (auto i : axes) {
+      signal_numel *= out_sizes[i];
+    }
+    R factor = compute_factor<R>(signal_numel, normalization);
+    pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
+                   out_data, factor);
   }
+};
 
-  return *plan_caches[device_index];
-}
+#endif
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc
index db3dc214bfe7a..0270f7e0576c8 100644
--- a/paddle/fluid/operators/spectral_op.cc
+++ b/paddle/fluid/operators/spectral_op.cc
@@ -13,28 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/spectral_op.h"
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-#if defined(PADDLE_WITH_ONEMKL)
-#include "paddle/phi/backends/dynload/mklrt.h"
-#elif defined(PADDLE_WITH_POCKETFFT)
-#include "extern_pocketfft/pocketfft_hdronly.h"
-#endif
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
+#include "paddle/fluid/operators/spectral_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) {
       norm));
 }
 
-// FFT Functors
-#if defined(PADDLE_WITH_ONEMKL)
-
-#define MKL_DFTI_CHECK(expr)                                                   \
-  do {                                                                         \
-    MKL_LONG status = (expr);                                                  \
-    if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR))                  \
-      PADDLE_THROW(                                                            \
-          platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
-  } while (0);
-
-namespace {
-
-struct DftiDescriptorDeleter {
-  void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
-    if (handle != nullptr) {
-      MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
-    }
-  }
-};
-
-// A RAII wrapper for MKL_DESCRIPTOR*
-class DftiDescriptor {
- public:
-  void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
-            MKL_LONG signal_ndim, MKL_LONG* sizes) {
-    PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
-                      platform::errors::AlreadyExists(
-                          "DftiDescriptor has already been initialized."));
-
-    DFTI_DESCRIPTOR* raw_desc;
-    MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
-        &raw_desc, precision, signal_type, signal_ndim, sizes));
-    desc_.reset(raw_desc);
-  }
-
-  DFTI_DESCRIPTOR* get() const {
-    DFTI_DESCRIPTOR* raw_desc = desc_.get();
-    PADDLE_ENFORCE_NOT_NULL(raw_desc,
-                            platform::errors::PreconditionNotMet(
-                                "DFTI DESCRIPTOR has not been initialized."));
-    return raw_desc;
-  }
-
- private:
-  std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
-};
-
-DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
-                             const framework::proto::VarType::Type& out_dtype,
-                             const framework::DDim& in_strides,
-                             const framework::DDim& out_strides,
-                             const std::vector<int>& signal_sizes,
-                             FFTNormMode normalization, bool forward) {
-  const DFTI_CONFIG_VALUE precision = [&] {
-    switch (in_dtype) {
-      case framework::proto::VarType::FP32:
-        return DFTI_SINGLE;
-      case framework::proto::VarType::COMPLEX64:
-        return DFTI_SINGLE;
-      case framework::proto::VarType::FP64:
-        return DFTI_DOUBLE;
-      case framework::proto::VarType::COMPLEX128:
-        return DFTI_DOUBLE;
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Invalid input datatype (%s), input data type should be FP32, "
-            "FP64, COMPLEX64 or COMPLEX128.",
-            framework::DataTypeToString(in_dtype)));
-    }
-  }();
-
-  // C2C, R2C, C2R
-  const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
-  const DFTI_CONFIG_VALUE domain =
-      (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
-
-  DftiDescriptor descriptor;
-  std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
-  const MKL_LONG signal_ndim = fft_sizes.size() - 1;
-  descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
-
-  // placement inplace or not inplace
-  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
-                                            DFTI_NOT_INPLACE));
-
-  // number of transformations
-  const MKL_LONG batch_size = fft_sizes[0];
-  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
-      descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
-
-  // input & output distance
-  const MKL_LONG idist = in_strides[0];
-  const MKL_LONG odist = out_strides[0];
-  MKL_DFTI_CHECK(
-      phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
-  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
-                                            DFTI_OUTPUT_DISTANCE, odist));
-
-  // input & output stride
-  std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
-  std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
-  for (MKL_LONG i = 1; i <= signal_ndim; i++) {
-    mkl_in_stride[i] = in_strides[i];
-    mkl_out_stride[i] = out_strides[i];
-  }
-  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
-      descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
-  MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
-      descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
-
-  // conjugate even storage
-  if (!(fft_type == FFTTransformType::C2C)) {
-    MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
-        descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
-  }
-
-  MKL_LONG signal_numel =
-      std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
-                      std::multiplies<MKL_LONG>());
-  if (normalization != FFTNormMode::none) {
-    const double scale =
-        ((normalization == FFTNormMode::by_sqrt_n)
-             ? 1.0 / std::sqrt(static_cast<double>(signal_numel))
-             : 1.0 / static_cast<double>(signal_numel));
-    const auto scale_direction = [&]() {
-      if (fft_type == FFTTransformType::R2C ||
-          (fft_type == FFTTransformType::C2C && forward)) {
-        return DFTI_FORWARD_SCALE;
-      } else {
-        // (fft_type == FFTTransformType::C2R ||
-        //          (fft_type == FFTTransformType::C2C && !forward))
-        return DFTI_BACKWARD_SCALE;
-      }
-    }();
-    MKL_DFTI_CHECK(
-        phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
-  }
-
-  // commit the descriptor
-  MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
-  return descriptor;
-}
-
-// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
-template <typename DeviceContext, typename Ti, typename To>
-void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
-              const std::vector<int64_t>& axes, FFTNormMode normalization,
-              bool forward) {
-  const framework::DDim& in_sizes = x->dims();
-  const int ndim = in_sizes.size();
-  const int signal_ndim = axes.size();
-  const int batch_ndim = ndim - signal_ndim;
-  const framework::DDim& out_sizes = out->dims();
-
-  // make a dim permutation
-  std::vector<int> dim_permute(ndim);
-  std::iota(dim_permute.begin(), dim_permute.end(), 0);
-  std::vector<bool> is_transformed_dim(ndim, false);
-  for (const auto& d : axes) {
-    is_transformed_dim[d] = true;
-  }
-  const auto batch_end =
-      std::partition(dim_permute.begin(), dim_permute.end(),
-                     [&](size_t axis) { return !is_transformed_dim[axis]; });
-  std::copy(axes.cbegin(), axes.cend(), batch_end);
-
-  // transpose input according to that permutation
-  framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
-  std::vector<int64_t> transposed_input_shape_ =
-      phi::vectorize(transposed_input_shape);
-  framework::Tensor transposed_input;
-  transposed_input.Resize(transposed_input_shape);
-  const auto place = ctx.GetPlace();
-  transposed_input.mutable_data<Ti>(place);
-  TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
-                                               dim_permute);
-
-  // make an collapsed input: collapse batch axes for input
-  const int batch_size = std::accumulate(
-      transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
-      1L, std::multiplies<int64_t>());
-  std::vector<int> collapsed_input_shape_(1 + signal_ndim);
-  collapsed_input_shape_[0] = batch_size;
-  std::copy(transposed_input_shape_.begin() + batch_ndim,
-            transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
-  const framework::DDim collapsed_input_shape =
-      phi::make_ddim(collapsed_input_shape_);
-  transposed_input.Resize(collapsed_input_shape);
-  framework::Tensor& collapsed_input = transposed_input;
-
-  // make a collapsed output
-  std::vector<int> collapsed_output_shape_(1 + signal_ndim);
-  collapsed_output_shape_[0] = batch_size;
-  for (int i = 0; i < signal_ndim; i++) {
-    collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
-  }
-  const framework::DDim collapsed_output_shape =
-      phi::make_ddim(collapsed_output_shape_);
-  framework::Tensor collapsed_output;
-  collapsed_output.Resize(collapsed_output_shape);
-  collapsed_output.mutable_data(place, out->type());
-
-  // signal sizes
-  std::vector<int> signal_sizes(1 + signal_ndim);
-  signal_sizes[0] = batch_size;
-  for (int i = 0; i < signal_ndim; i++) {
-    signal_sizes[1 + i] =
-        std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
-  }
-
-  // input & output stride
-  const framework::DDim input_stride = phi::stride(collapsed_input_shape);
-  const framework::DDim output_stride = phi::stride(collapsed_output_shape);
-
-  // make a DFTI_DESCRIPTOR
-  DftiDescriptor desc =
-      _plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
-                    framework::TransToProtoVarType(out->dtype()), input_stride,
-                    output_stride, signal_sizes, normalization, forward);
-
-  const FFTTransformType fft_type =
-      GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
-                          framework::TransToProtoVarType(out->type()));
-  if (fft_type == FFTTransformType::C2R && forward) {
-    framework::Tensor collapsed_input_conj(collapsed_input.dtype());
-    collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
-                                          ctx.GetPlace());
-    // conjugate the input
-    platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
-    phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
-                                        collapsed_input.numel(),
-                                        collapsed_input_conj.data<Ti>());
-    for_range(functor);
-    MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
-        desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
-  } else if (fft_type == FFTTransformType::R2C && !forward) {
-    framework::Tensor collapsed_output_conj(collapsed_output.dtype());
-    collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
-                                           ctx.GetPlace());
-    MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
-        desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
-    // conjugate the output
-    platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
-    phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
-                                        collapsed_output.numel(),
-                                        collapsed_output.data<To>());
-    for_range(functor);
-  } else {
-    if (forward) {
-      MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
-          desc.get(), collapsed_input.data(), collapsed_output.data()));
-    } else {
-      MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
-          desc.get(), collapsed_input.data(), collapsed_output.data()));
-    }
-  }
-
-  // resize for the collapsed output
-  framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
-  collapsed_output.Resize(transposed_output_shape);
-  framework::Tensor& transposed_output = collapsed_output;
-
-  // reverse the transposition
-  std::vector<int> reverse_dim_permute(ndim);
-  for (int i = 0; i < ndim; i++) {
-    reverse_dim_permute[dim_permute[i]] = i;
-  }
-  TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
-                                               out, reverse_dim_permute);
-}
-}  // anonymous namespace
-
-template <typename Ti, typename To>
-struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
-                                                 normalization, forward);
-  }
-};
-
-template <typename Ti, typename To>
-struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
-                                                 normalization, forward);
-  }
-};
-
-template <typename Ti, typename To>
-struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    if (axes.size() > 1) {
-      const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
-      Tensor temp;
-      temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
-
-      FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
-      c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
-
-      const std::vector<int64_t> new_axes{axes.back()};
-      exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
-                                                   normalization, forward);
-    } else {
-      exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
-                                                   normalization, forward);
-    }
-  }
-};
-
-#elif defined(PADDLE_WITH_POCKETFFT)
-
-namespace {
-template <typename T>
-T compute_factor(int64_t size, FFTNormMode normalization) {
-  constexpr auto one = static_cast<T>(1);
-  switch (normalization) {
-    case FFTNormMode::none:
-      return one;
-    case FFTNormMode::by_n:
-      return one / static_cast<T>(size);
-    case FFTNormMode::by_sqrt_n:
-      return one / std::sqrt(static_cast<T>(size));
-  }
-  PADDLE_THROW(
-      platform::errors::InvalidArgument("Unsupported normalization type"));
-}
-}  // anonymous namespace
-
-template <typename Ti, typename To>
-struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    using R = typename Ti::value_type;
-    using C = std::complex<R>;
-
-    const auto& input_dim = x->dims();
-    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
-    std::vector<std::ptrdiff_t> in_strides =
-        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
-    const int64_t data_size = sizeof(C);
-    std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
-                   [&](std::ptrdiff_t s) { return s * data_size; });
-
-    const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
-    auto* out_data = reinterpret_cast<C*>(out->data<To>());
-    // pocketfft requires std::vector<size_t>
-    std::vector<size_t> axes_(axes.size());
-    std::copy(axes.begin(), axes.end(), axes_.begin());
-    // compuet factor
-    int64_t signal_numel = 1;
-    for (auto i : axes) {
-      signal_numel *= in_sizes[i];
-    }
-    R factor = compute_factor<R>(signal_numel, normalization);
-    pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
-                   out_data, factor);
-  }
-};
-
-template <typename Ti, typename To>
-struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    using R = Ti;
-    using C = std::complex<R>;
-
-    const auto& input_dim = x->dims();
-    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
-    std::vector<std::ptrdiff_t> in_strides =
-        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
-    {
-      const int64_t data_size = sizeof(R);
-      std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
-                     [&](std::ptrdiff_t s) { return s * data_size; });
-    }
-
-    const auto& output_dim = out->dims();
-    const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
-    std::vector<std::ptrdiff_t> out_strides =
-        phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
-    {
-      const int64_t data_size = sizeof(C);
-      std::transform(out_strides.begin(), out_strides.end(),
-                     out_strides.begin(),
-                     [&](std::ptrdiff_t s) { return s * data_size; });
-    }
-
-    const auto* in_data = x->data<R>();
-    auto* out_data = reinterpret_cast<C*>(out->data<To>());
-    // pocketfft requires std::vector<size_t>
-    std::vector<size_t> axes_(axes.size());
-    std::copy(axes.begin(), axes.end(), axes_.begin());
-    // compuet normalization factor
-    int64_t signal_numel = 1;
-    for (auto i : axes) {
-      signal_numel *= in_sizes[i];
-    }
-    R factor = compute_factor<R>(signal_numel, normalization);
-    pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
-                   out_data, factor);
-  }
-};
-
-template <typename Ti, typename To>
-struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
-  void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    using R = To;
-    using C = std::complex<R>;
-
-    const auto& input_dim = x->dims();
-    const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
-    std::vector<std::ptrdiff_t> in_strides =
-        phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
-    {
-      const int64_t data_size = sizeof(C);
-      std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
-                     [&](std::ptrdiff_t s) { return s * data_size; });
-    }
-
-    const auto& output_dim = out->dims();
-    const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
-    std::vector<std::ptrdiff_t> out_strides =
-        phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
-    {
-      const int64_t data_size = sizeof(R);
-      std::transform(out_strides.begin(), out_strides.end(),
-                     out_strides.begin(),
-                     [&](std::ptrdiff_t s) { return s * data_size; });
-    }
-
-    const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
-    auto* out_data = out->data<R>();
-    // pocketfft requires std::vector<size_t>
-    std::vector<size_t> axes_(axes.size());
-    std::copy(axes.begin(), axes.end(), axes_.begin());
-    // compuet normalization factor
-    int64_t signal_numel = 1;
-    for (auto i : axes) {
-      signal_numel *= out_sizes[i];
-    }
-    R factor = compute_factor<R>(signal_numel, normalization);
-    pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
-                   out_data, factor);
-  }
-};
-
-#endif
-
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu
index b7b6b5302afd6..b7fb83d9d5cef 100644
--- a/paddle/fluid/operators/spectral_op.cu
+++ b/paddle/fluid/operators/spectral_op.cu
@@ -8,496 +8,9 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
-#include <functional>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <numeric>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
 
-#include "paddle/fluid/operators/conj_op.h"
-#include "paddle/fluid/operators/spectral_helper.h"
+#include "paddle/fluid/operators/spectral_op.cu.h"
 #include "paddle/fluid/operators/spectral_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-
-namespace paddle {
-namespace operators {
-
-namespace {
-
-// Calculates the normalization constant
-double fft_normalization_scale(FFTNormMode normalization,
-                               const std::vector<int64_t>& sizes,
-                               const std::vector<int64_t>& dims) {
-  // auto norm = static_cast<fft_norm_mode>(normalization);
-  if (normalization == FFTNormMode::none) {
-    return static_cast<double>(1.0);
-  }
-
-  int64_t signal_numel = 1;
-  for (auto dim : dims) {
-    signal_numel *= sizes[dim];
-  }
-  const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
-                                 ? std::sqrt(signal_numel)
-                                 : static_cast<double>(signal_numel);
-  return static_cast<double>(1.0 / scale_denom);
-}
-
-template <typename DeviceContext, typename T>
-void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
-                        FFTNormMode normalization,
-                        const std::vector<int64_t>& sizes,
-                        const std::vector<int64_t>& axes) {
-  double scale = fft_normalization_scale(normalization, sizes, axes);
-  if (scale != 1.0) {
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto dev = ctx.eigen_device();
-    EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
-                                          static_cast<T>(scale),
-                                          static_cast<T>(0), false);
-  } else {
-    framework::TensorCopy(*in, ctx.GetPlace(), out);
-  }
-}
-
-#if defined(PADDLE_WITH_CUDA)
-FFTConfigKey create_fft_configkey(const framework::Tensor& input,
-                                  const framework::Tensor& output,
-                                  int signal_ndim) {
-  // Create the transform plan (either from cache or locally)
-  const auto value_type =
-      framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
-          ? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
-          : framework::TransToProtoVarType(input.dtype());
-  auto fft_type =
-      GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
-                          framework::TransToProtoVarType(output.dtype()));
-  // signal sizes
-  std::vector<int64_t> signal_size(signal_ndim + 1);
-
-  signal_size[0] = input.dims()[0];
-  for (int64_t i = 1; i <= signal_ndim; ++i) {
-    auto in_size = input.dims()[i];
-    auto out_size = output.dims()[i];
-    signal_size[i] = std::max(in_size, out_size);
-  }
-  FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
-                   signal_size, fft_type, value_type);
-  return key;
-}
-
-// Execute a pre-planned transform
-static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
-                                void* out_data, bool forward) {
-  auto& plan = config.plan();
-
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
-      plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
-}
-
-template <typename DeviceContext, typename Ti, typename To>
-void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
-                     framework::Tensor* input, framework::Tensor* output,
-                     bool forward) {
-  // execute transform plan
-  auto fft_type = config.transform_type();
-  if (fft_type == FFTTransformType::C2R && forward) {
-    forward = false;
-    framework::Tensor input_conj(input->type());
-    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
-    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
-    phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
-                                        input_conj.data<Ti>());
-    for_range(functor);
-    exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
-  } else if (fft_type == FFTTransformType::R2C && !forward) {
-    forward = true;
-    framework::Tensor out_conj(output->type());
-    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
-    exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
-
-    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
-    phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
-                                        output->data<To>());
-    for_range(functor);
-  } else {
-    exec_cufft_plan_raw(config, input->data(), output->data(), forward);
-  }
-}
-
-#elif defined(PADDLE_WITH_HIP)
-
-FFTConfigKey create_fft_configkey(const framework::Tensor& input,
-                                  const framework::Tensor& output,
-                                  int signal_ndim) {
-  // Create the transform plan (either from cache or locally)
-  const auto value_type =
-      framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
-          ? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
-          : framework::TransToProtoVarType(input.dtype());
-  auto fft_type =
-      GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
-                          framework::TransToProtoVarType(output.type()));
-  // signal sizes
-  std::vector<int64_t> signal_size(signal_ndim + 1);
-
-  signal_size[0] = input.dims()[0];
-  for (int64_t i = 1; i <= signal_ndim; ++i) {
-    auto in_size = input.dims()[i];
-    auto out_size = output.dims()[i];
-    signal_size[i] = std::max(in_size, out_size);
-  }
-  FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
-                   signal_size, fft_type, value_type);
-  return key;
-}
-
-// Execute a pre-planned transform
-static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
-                                 void* out_data, bool forward) {
-  auto& plan = config.plan();
-
-  auto value_type = config.data_type();
-  if (value_type == framework::proto::VarType::FP32) {
-    switch (config.transform_type()) {
-      case FFTTransformType::C2C: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
-            plan, static_cast<hipfftComplex*>(in_data),
-            static_cast<hipfftComplex*>(out_data),
-            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
-        return;
-      }
-      case FFTTransformType::R2C: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
-            plan, static_cast<hipfftReal*>(in_data),
-            static_cast<hipfftComplex*>(out_data)));
-        return;
-      }
-      case FFTTransformType::C2R: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
-            plan, static_cast<hipfftComplex*>(in_data),
-            static_cast<hipfftReal*>(out_data)));
-        return;
-      }
-    }
-  } else if (value_type == framework::proto::VarType::FP64) {
-    switch (config.transform_type()) {
-      case FFTTransformType::C2C: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
-            plan, static_cast<hipfftDoubleComplex*>(in_data),
-            static_cast<hipfftDoubleComplex*>(out_data),
-            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
-        return;
-      }
-      case FFTTransformType::R2C: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
-            plan, static_cast<hipfftDoubleReal*>(in_data),
-            static_cast<hipfftDoubleComplex*>(out_data)));
-        return;
-      }
-      case FFTTransformType::C2R: {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
-            plan, static_cast<hipfftDoubleComplex*>(in_data),
-            static_cast<hipfftDoubleReal*>(out_data)));
-        return;
-      }
-    }
-  }
-  PADDLE_THROW(platform::errors::InvalidArgument(
-      "hipFFT only support transforms of type float32 and float64"));
-}
-
-template <typename DeviceContext, typename Ti, typename To>
-void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
-                      framework::Tensor* input, framework::Tensor* output,
-                      bool forward) {
-  auto fft_type = config.transform_type();
-  if (fft_type == FFTTransformType::C2R && forward) {
-    forward = false;
-    framework::Tensor input_conj(input->type());
-    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
-    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
-    phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
-                                        input_conj.data<Ti>());
-    for_range(functor);
-    exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
-  } else if (fft_type == FFTTransformType::R2C && !forward) {
-    forward = true;
-    framework::Tensor out_conj(output->type());
-    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
-    exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
-
-    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
-    phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
-                                        output->data<To>());
-    for_range(functor);
-  } else {
-    exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
-  }
-}
-
-#endif
-
-// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
-// onesided c2r)
-template <typename DeviceContext, typename Ti, typename To>
-void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
-              const std::vector<int64_t>& dim, bool forward) {
-  const auto x_dims = phi::vectorize(X->dims());
-  const int64_t ndim = static_cast<int64_t>(X->dims().size());
-  auto tensor_place = ctx.GetPlace();
-
-  // make a dim permutation
-  std::vector<int> dim_permute(ndim);
-  std::iota(dim_permute.begin(), dim_permute.end(), int{0});
-  std::vector<bool> is_transformed_dim(ndim);
-  for (const auto& d : dim) {
-    is_transformed_dim[d] = true;
-  }
-  auto batch_end =
-      std::partition(dim_permute.begin(), dim_permute.end(),
-                     [&](int64_t d) { return !is_transformed_dim[d]; });
-  std::sort(dim_permute.begin(), batch_end);
-  std::copy(dim.cbegin(), dim.cend(), batch_end);
-
-  // transpose input according to dim permutation
-  auto transposed_input_shape = X->dims().transpose(dim_permute);
-  framework::Tensor transposed_input;
-  transposed_input.Resize(transposed_input_shape);
-  transposed_input.mutable_data<Ti>(tensor_place);
-  TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
-                                  dim_permute);
-
-  // Reshape batch dimensions into a single dimension
-  const int64_t signal_ndim = static_cast<int64_t>(dim.size());
-  std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
-
-  auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
-  const int64_t batch_dims = ndim - signal_ndim;
-  auto batch_size =
-      std::accumulate(transposed_input_shape_.begin(),
-                      transposed_input_shape_.begin() + batch_dims,
-                      static_cast<int>(1), std::multiplies<int>());
-  collapsed_input_shape[0] = batch_size;
-
-  std::copy(transposed_input_shape_.begin() + batch_dims,
-            transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
-
-  framework::Tensor& collapsed_input = transposed_input;
-  collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
-
-  // make a collpased output
-  const auto out_dims = phi::vectorize(out->dims());
-  std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
-  collapsed_output_shape[0] = batch_size;
-  for (size_t i = 0; i < dim.size(); ++i) {
-    collapsed_output_shape[i + 1] = out_dims[dim[i]];
-  }
-  framework::Tensor collapsed_output;
-  collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
-  collapsed_output.mutable_data<To>(tensor_place);
-
-  FFTConfig* config = nullptr;
-
-#if defined(PADDLE_WITH_CUDA)
-  std::unique_ptr<FFTConfig> config_ = nullptr;
-  // create plan
-  FFTConfigKey key =
-      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
-  bool using_cache = false;
-#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
-  using_cache = true;
-#endif
-
-  if (using_cache) {
-    const int64_t device_id = static_cast<int64_t>(
-        reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
-            ->GetDeviceId());
-    FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
-    std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
-    guard.lock();
-    config = &(plan_cache.lookup(key));
-  } else {
-    config_ = std::make_unique<FFTConfig>(key);
-    config = config_.get();
-  }
-
-  // prepare cufft for execution
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
-  framework::Tensor workspace_tensor;
-  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
-      config->plan(), workspace_tensor.data<To>()));
-  // execute transform plan
-  exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
-                                         &collapsed_output, forward);
-
-#elif defined(PADDLE_WITH_HIP)
-  // create plan
-  FFTConfigKey key =
-      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
-  const int64_t device_id = static_cast<int64_t>(
-      reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
-          ->GetDeviceId());
-  FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
-  std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
-  guard.lock();
-  config = &(plan_cache.lookup(key));
-
-  // prepare cufft for execution
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
-  framework::Tensor workspace_tensor;
-  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
-      config->plan(), workspace_tensor.data<To>()));
-  // execute transform plan
-  exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
-                                          &collapsed_output, forward);
-#endif
-
-  // Inverting output by reshape and transpose to original batch and dimension
-  auto transposed_out_shape = out->dims().transpose(dim_permute);
-
-  collapsed_output.Resize(transposed_out_shape);
-  auto& transposed_output = collapsed_output;
-
-  std::vector<int> reverse_dim_permute(ndim);
-  for (size_t i = 0; i < ndim; i++) {
-    reverse_dim_permute[dim_permute[i]] = i;
-  }
-
-  TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
-                                  reverse_dim_permute);
-}
-
-}  // anonymous namespace
-
-// Use the optimized path to perform single R2C or C2R if transformation dim is
-// supported by cuFFT
-bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
-  // For performance reason, when axes starts with (0, 1), do not use the
-  // optimized path.
-  if (axes.size() > kMaxFFTNdim ||
-      (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
-    return false;
-  } else {
-    return true;
-  }
-}
-
-template <typename Ti, typename To>
-struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
-  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    if (axes.empty()) {
-      framework::TensorCopy(*X, ctx.GetPlace(), out);
-      return;
-    }
-
-    framework::Tensor* p_out = out;
-    std::vector<int64_t> out_dims = phi::vectorize(X->dims());
-    std::vector<int64_t> working_axes(axes.begin(), axes.end());
-    std::vector<int64_t> first_dims;
-    size_t max_dims;
-    framework::Tensor working_tensor;
-    working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
-    framework::Tensor* p_working_tensor = &working_tensor;
-    framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
-
-    while (true) {
-      max_dims =
-          std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
-      first_dims.assign(working_axes.end() - max_dims, working_axes.end());
-
-      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
-                                                    p_out, first_dims, forward);
-      working_axes.resize(working_axes.size() - max_dims);
-      first_dims.clear();
-
-      if (working_axes.empty()) {
-        break;
-      }
-
-      std::swap(p_out, p_working_tensor);
-    }
-    exec_normalization<platform::CUDADeviceContext, To>(
-        ctx, p_out, out, normalization, out_dims, axes);
-  }
-};
-
-template <typename Ti, typename To>
-struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
-  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    std::vector<int64_t> in_dims = phi::vectorize(X->dims());
-    std::vector<int64_t> out_dims = phi::vectorize(out->dims());
-
-    if (use_optimized_fft_path(axes)) {
-      framework::Tensor x_copy(X->type());
-      x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
-      framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
-      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
-                                                    forward);
-    } else {
-      framework::Tensor temp_tensor;
-      temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
-      const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
-
-      FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
-      c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
-
-      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
-                                                    {axes.back()}, forward);
-    }
-    exec_normalization<platform::CUDADeviceContext, To>(
-        ctx, out, out, normalization, out_dims, axes);
-  }
-};
-
-// n dimension real to complex FFT use cufft lib
-template <typename Ti, typename To>
-struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
-  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
-                  Tensor* out, const std::vector<int64_t>& axes,
-                  FFTNormMode normalization, bool forward) {
-    // Step1: R2C transform on the last dimension
-    framework::Tensor* r2c_out = out;
-    const std::vector<int64_t> last_dim{axes.back()};
-    std::vector<int64_t> out_dims = phi::vectorize(out->dims());
-    exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
-                                                  forward);
-
-    // Step2: C2C transform on the remaining dimension
-    framework::Tensor c2c_out;
-    if (axes.size() > 1) {
-      c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
-      std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
-      FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
-      fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
-                   forward);
-    }
-
-    const auto in_sizes = phi::vectorize(X->dims());
-    framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
-    exec_normalization<platform::CUDADeviceContext, To>(
-        ctx, norm_tensor, out, normalization, in_sizes, axes);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h
new file mode 100644
index 0000000000000..fdb0e0d284884
--- /dev/null
+++ b/paddle/fluid/operators/spectral_op.cu.h
@@ -0,0 +1,944 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/operators/conj_op.h"
+#include "paddle/fluid/operators/spectral_op.h"
+#include "paddle/fluid/operators/transpose_op.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/hipfft.h"
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/cufft.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using ScalarType = framework::proto::VarType::Type;
+const int64_t kMaxFFTNdim = 3;
+const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
+// This struct is used to easily compute hashes of the
+// parameters. It will be the **key** to the plan cache.
+struct FFTConfigKey {
+  // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
+  int64_t signal_ndim_;
+  // These include additional batch dimension as well.
+  int64_t sizes_[kMaxDataNdim];
+  int64_t input_shape_[kMaxDataNdim];
+  int64_t output_shape_[kMaxDataNdim];
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+
+  FFTConfigKey() = default;
+
+  FFTConfigKey(const std::vector<int64_t>& in_shape,
+               const std::vector<int64_t>& out_shape,
+               const std::vector<int64_t>& signal_size,
+               FFTTransformType fft_type, ScalarType value_type) {
+    // Padding bits must be zeroed for hashing
+    memset(this, 0, sizeof(*this));
+    signal_ndim_ = signal_size.size() - 1;
+    fft_type_ = fft_type;
+    value_type_ = value_type;
+
+    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
+    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
+    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
+  }
+};
+
+#if defined(PADDLE_WITH_CUDA)
+// An RAII encapsulation of cuFFTHandle
+class CuFFTHandle {
+  ::cufftHandle handle_;
+
+ public:
+  CuFFTHandle() {
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
+  }
+
+  CuFFTHandle(const CuFFTHandle& other) = delete;
+  CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
+
+  CuFFTHandle(CuFFTHandle&& other) = delete;
+  CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
+
+  ::cufftHandle& get() { return handle_; }
+  const ::cufftHandle& get() const { return handle_; }
+
+  ~CuFFTHandle() {
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_));
+  }
+};
+
+using plan_size_type = long long int;  // NOLINT
+// This class contains all the information needed to execute a cuFFT plan:
+//   1. the plan
+//   2. the workspace size needed
+class FFTConfig {
+ public:
+  // Only move semantics is enought for this class. Although we already use
+  // unique_ptr for the plan, still remove copy constructor and assignment op so
+  // we don't accidentally copy and take perf hit.
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
+            std::vector<int64_t>(plan_key.sizes_,
+                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
+            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
+
+  // sizes are full signal, including batch size and always two-sided
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
+      : fft_type_(fft_type), value_type_(dtype) {
+    // signal sizes (excluding batch dim)
+    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const auto batch = static_cast<plan_size_type>(sizes[0]);
+    // const int64_t signal_ndim = sizes.size() - 1;
+    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
+                      platform::errors::InvalidArgument(
+                          "The signal_ndim must be equal to sizes.size() - 1,"
+                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
+                          signal_ndim, sizes.size() - 1));
+
+    cudaDataType itype, otype, exec_type;
+    const auto complex_input = has_complex_input(fft_type);
+    const auto complex_output = has_complex_output(fft_type);
+    if (dtype == framework::proto::VarType::FP32) {
+      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
+      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
+      exec_type = CUDA_C_32F;
+    } else if (dtype == framework::proto::VarType::FP64) {
+      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
+      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
+      exec_type = CUDA_C_64F;
+    } else if (dtype == framework::proto::VarType::FP16) {
+      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
+      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
+      exec_type = CUDA_C_16F;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "cuFFT only support transforms of type float16, float32 and "
+          "float64"));
+    }
+
+    // disable auto allocation of workspace to use allocator from the framework
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
+        plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
+        plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
+        batch, &ws_size_t, exec_type));
+
+    ws_size = ws_size_t;
+  }
+
+  FFTConfig(const FFTConfig& other) = delete;
+  FFTConfig& operator=(const FFTConfig& other) = delete;
+
+  FFTConfig(FFTConfig&& other) = delete;
+  FFTConfig& operator=(FFTConfig&& other) = delete;
+
+  const cufftHandle& plan() const { return plan_ptr.get(); }
+
+  FFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  size_t workspace_size() const { return ws_size; }
+
+ private:
+  CuFFTHandle plan_ptr;
+  size_t ws_size;
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+
+#elif defined(PADDLE_WITH_HIP)
+// An RAII encapsulation of cuFFTHandle
+class HIPFFTHandle {
+  ::hipfftHandle handle_;
+
+ public:
+  HIPFFTHandle() {
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_));
+  }
+
+  HIPFFTHandle(const HIPFFTHandle& other) = delete;
+  HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
+
+  HIPFFTHandle(HIPFFTHandle&& other) = delete;
+  HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
+
+  ::hipfftHandle& get() { return handle_; }
+  const ::hipfftHandle& get() const { return handle_; }
+
+  ~HIPFFTHandle() {
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
+  }
+};
+using plan_size_type = int;
+// This class contains all the information needed to execute a cuFFT plan:
+//   1. the plan
+//   2. the workspace size needed
+class FFTConfig {
+ public:
+  // Only move semantics is enought for this class. Although we already use
+  // unique_ptr for the plan, still remove copy constructor and assignment op so
+  // we don't accidentally copy and take perf hit.
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
+            std::vector<int64_t>(plan_key.sizes_,
+                                 plan_key.sizes_ + plan_key.signal_ndim_ + 1),
+            plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
+
+  // sizes are full signal, including batch size and always two-sided
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
+      : fft_type_(fft_type), value_type_(dtype) {
+    // signal sizes (excluding batch dim)
+    std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const auto batch = static_cast<plan_size_type>(sizes[0]);
+    // const int64_t signal_ndim = sizes.size() - 1;
+    PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
+                      platform::errors::InvalidArgument(
+                          "The signal_ndim must be equal to sizes.size() - 1,"
+                          "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
+                          signal_ndim, sizes.size() - 1));
+
+    hipfftType exec_type = [&] {
+      if (dtype == framework::proto::VarType::FP32) {
+        switch (fft_type) {
+          case FFTTransformType::C2C:
+            return HIPFFT_C2C;
+          case FFTTransformType::R2C:
+            return HIPFFT_R2C;
+          case FFTTransformType::C2R:
+            return HIPFFT_C2R;
+        }
+      } else if (dtype == framework::proto::VarType::FP64) {
+        switch (fft_type) {
+          case FFTTransformType::C2C:
+            return HIPFFT_Z2Z;
+          case FFTTransformType::R2C:
+            return HIPFFT_D2Z;
+          case FFTTransformType::C2R:
+            return HIPFFT_Z2D;
+        }
+      }
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "hipFFT only support transforms of type float32 and float64"));
+    }();
+
+    // disable auto allocation of workspace to use allocator from the framework
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
+        plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
+        plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
+        batch, &ws_size_t));
+
+    ws_size = ws_size_t;
+  }
+
+  const hipfftHandle& plan() const { return plan_ptr.get(); }
+
+  FFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  size_t workspace_size() const { return ws_size; }
+
+ private:
+  HIPFFTHandle plan_ptr;
+  size_t ws_size;
+  FFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+#endif
+
+// Hashing machinery for Key
+// Fowler–Noll–Vo hash function
+// see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+template <typename Key>
+struct KeyHash {
+  // Key must be a POD because we read out its memory
+  // contenst as char* when hashing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  size_t operator()(const Key& params) const {
+    auto ptr = reinterpret_cast<const uint8_t*>(&params);
+    uint32_t value = 0x811C9DC5;
+    for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
+      value ^= ptr[i];
+      value *= 0x01000193;
+    }
+    return static_cast<size_t>(value);
+  }
+};
+
+template <typename Key>
+struct KeyEqual {
+  // Key must be a POD because we read out its memory
+  // contenst as char* when comparing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  bool operator()(const Key& a, const Key& b) const {
+    auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
+    auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
+    return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
+  }
+};
+
+#if CUDA_VERSION < 10000
+// Note that the max plan number for CUDA version < 10 has to be 1023
+// due to a bug that fails on the 1024th plan
+constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
+#else
+constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
+// The default max cache size chosen for CUDA version > 10 is arbitrary.
+// This number puts a limit on how big of a plan cache should we maintain by
+// default. Users can always configure it via cufft_set_plan_cache_max_size.
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
+#endif
+static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
+                  CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
+              "CUFFT_MAX_PLAN_NUM not in size_t range");
+static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
+                  CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
+              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
+
+// This cache assumes that the mapping from key to value never changes.
+// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
+// value returned from try_emplace_value.
+// The contract of using this cache is that try_emplace_value should only be
+// used when the max_size is positive.
+class FFTConfigCache {
+ public:
+  using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
+  using map_t = typename std::unordered_map<
+      std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
+      KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
+  using map_kkv_iter_t = typename map_t::iterator;
+
+  FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
+
+  explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
+
+  FFTConfigCache(const FFTConfigCache& other) = delete;
+  FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
+
+  FFTConfigCache(FFTConfigCache&& other) noexcept
+      : _usage_list(std::move(other._usage_list)),
+        _cache_map(std::move(other._cache_map)),
+        _max_size(other._max_size) {}
+
+  FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
+    _usage_list = std::move(other._usage_list);
+    _cache_map = std::move(other._cache_map);
+    _max_size = other._max_size;
+    return *this;
+  }
+
+  // If key is in this cache, return the cached config. Otherwise, emplace the
+  // config in this cache and return it.
+  FFTConfig& lookup(FFTConfigKey params) {
+    PADDLE_ENFORCE_GT(_max_size, 0,
+                      platform::errors::InvalidArgument(
+                          "The max size of FFTConfigCache must be great than 0,"
+                          "But received is [%d]",
+                          _max_size));
+
+    map_kkv_iter_t map_it = _cache_map.find(params);
+    // Hit, put to list front
+    if (map_it != _cache_map.end()) {
+      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
+      return map_it->second->second;
+    }
+
+    // Miss
+    // remove if needed
+    if (_usage_list.size() >= _max_size) {
+      auto last = _usage_list.end();
+      last--;
+      _cache_map.erase(last->first);
+      _usage_list.pop_back();
+    }
+
+    // construct new plan at list front, then insert into _cache_map
+    _usage_list.emplace_front(std::piecewise_construct,
+                              std::forward_as_tuple(params),
+                              std::forward_as_tuple(params));
+    auto kv_it = _usage_list.begin();
+    _cache_map.emplace(std::piecewise_construct,
+                       std::forward_as_tuple(kv_it->first),
+                       std::forward_as_tuple(kv_it));
+    return kv_it->second;
+  }
+
+  void clear() {
+    _cache_map.clear();
+    _usage_list.clear();
+  }
+
+  void resize(int64_t new_size) {
+    _set_max_size(new_size);
+    auto cur_size = _usage_list.size();
+    if (cur_size > _max_size) {
+      auto delete_it = _usage_list.end();
+      for (size_t i = 0; i < cur_size - _max_size; i++) {
+        delete_it--;
+        _cache_map.erase(delete_it->first);
+      }
+      _usage_list.erase(delete_it, _usage_list.end());
+    }
+  }
+
+  size_t size() const { return _cache_map.size(); }
+
+  size_t max_size() const noexcept { return _max_size; }
+
+  std::mutex mutex;
+
+ private:
+  // Only sets size and does value check. Does not resize the data structures.
+  void _set_max_size(int64_t new_size) {
+    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
+    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
+    // first.
+    PADDLE_ENFORCE_GE(
+        new_size, 0,
+        platform::errors::InvalidArgument(
+            "cuFFT plan cache size must be non-negative, But received is [%d]",
+            new_size));
+    PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
+                      platform::errors::InvalidArgument(
+                          "cuFFT plan cache size can not be larger than [%d], "
+                          "But received is [%d]",
+                          CUFFT_MAX_PLAN_NUM, new_size));
+    _max_size = static_cast<size_t>(new_size);
+  }
+
+  std::list<kv_t> _usage_list;
+  map_t _cache_map;
+  size_t _max_size;
+};
+
+static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
+static std::mutex plan_caches_mutex;
+
+static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
+  std::lock_guard<std::mutex> guard(plan_caches_mutex);
+
+  if (device_index >= plan_caches.size()) {
+    plan_caches.resize(device_index + 1);
+  }
+
+  if (!plan_caches[device_index]) {
+    plan_caches[device_index] = std::make_unique<FFTConfigCache>();
+  }
+
+  return *plan_caches[device_index];
+}
+
+// Calculates the normalization constant
+static double fft_normalization_scale(FFTNormMode normalization,
+                                      const std::vector<int64_t>& sizes,
+                                      const std::vector<int64_t>& dims) {
+  // auto norm = static_cast<fft_norm_mode>(normalization);
+  if (normalization == FFTNormMode::none) {
+    return static_cast<double>(1.0);
+  }
+
+  int64_t signal_numel = 1;
+  for (auto dim : dims) {
+    signal_numel *= sizes[dim];
+  }
+  const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
+                                 ? std::sqrt(signal_numel)
+                                 : static_cast<double>(signal_numel);
+  return static_cast<double>(1.0 / scale_denom);
+}
+
+template <typename DeviceContext, typename T>
+void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
+                        FFTNormMode normalization,
+                        const std::vector<int64_t>& sizes,
+                        const std::vector<int64_t>& axes) {
+  double scale = fft_normalization_scale(normalization, sizes, axes);
+  if (scale != 1.0) {
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto dev = ctx.eigen_device();
+    EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
+                                          static_cast<T>(scale),
+                                          static_cast<T>(0), false);
+  } else {
+    framework::TensorCopy(*in, ctx.GetPlace(), out);
+  }
+}
+
+#if defined(PADDLE_WITH_CUDA)
+static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
+                                         const framework::Tensor& output,
+                                         int signal_ndim) {
+  // Create the transform plan (either from cache or locally)
+  const auto value_type =
+      framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
+          ? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
+          : framework::TransToProtoVarType(input.dtype());
+  auto fft_type =
+      GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
+                          framework::TransToProtoVarType(output.dtype()));
+  // signal sizes
+  std::vector<int64_t> signal_size(signal_ndim + 1);
+
+  signal_size[0] = input.dims()[0];
+  for (int64_t i = 1; i <= signal_ndim; ++i) {
+    auto in_size = input.dims()[i];
+    auto out_size = output.dims()[i];
+    signal_size[i] = std::max(in_size, out_size);
+  }
+  FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
+                   signal_size, fft_type, value_type);
+  return key;
+}
+
+// Execute a pre-planned transform
+static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
+                                void* out_data, bool forward) {
+  auto& plan = config.plan();
+
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
+      plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
+}
+
+template <typename DeviceContext, typename Ti, typename To>
+void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
+                     framework::Tensor* input, framework::Tensor* output,
+                     bool forward) {
+  // execute transform plan
+  auto fft_type = config.transform_type();
+  if (fft_type == FFTTransformType::C2R && forward) {
+    forward = false;
+    framework::Tensor input_conj(input->type());
+    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
+    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
+    phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
+                                        input_conj.data<Ti>());
+    for_range(functor);
+    exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
+  } else if (fft_type == FFTTransformType::R2C && !forward) {
+    forward = true;
+    framework::Tensor out_conj(output->type());
+    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
+    exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
+
+    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
+    phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
+                                        output->data<To>());
+    for_range(functor);
+  } else {
+    exec_cufft_plan_raw(config, input->data(), output->data(), forward);
+  }
+}
+
+#elif defined(PADDLE_WITH_HIP)
+
+static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
+                                         const framework::Tensor& output,
+                                         int signal_ndim) {
+  // Create the transform plan (either from cache or locally)
+  const auto value_type =
+      framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
+          ? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
+          : framework::TransToProtoVarType(input.dtype());
+  auto fft_type =
+      GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
+                          framework::TransToProtoVarType(output.type()));
+  // signal sizes
+  std::vector<int64_t> signal_size(signal_ndim + 1);
+
+  signal_size[0] = input.dims()[0];
+  for (int64_t i = 1; i <= signal_ndim; ++i) {
+    auto in_size = input.dims()[i];
+    auto out_size = output.dims()[i];
+    signal_size[i] = std::max(in_size, out_size);
+  }
+  FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
+                   signal_size, fft_type, value_type);
+  return key;
+}
+
+// Execute a pre-planned transform
+static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
+                                 void* out_data, bool forward) {
+  auto& plan = config.plan();
+
+  auto value_type = config.data_type();
+  if (value_type == framework::proto::VarType::FP32) {
+    switch (config.transform_type()) {
+      case FFTTransformType::C2C: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
+            plan, static_cast<hipfftComplex*>(in_data),
+            static_cast<hipfftComplex*>(out_data),
+            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
+        return;
+      }
+      case FFTTransformType::R2C: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
+            plan, static_cast<hipfftReal*>(in_data),
+            static_cast<hipfftComplex*>(out_data)));
+        return;
+      }
+      case FFTTransformType::C2R: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
+            plan, static_cast<hipfftComplex*>(in_data),
+            static_cast<hipfftReal*>(out_data)));
+        return;
+      }
+    }
+  } else if (value_type == framework::proto::VarType::FP64) {
+    switch (config.transform_type()) {
+      case FFTTransformType::C2C: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
+            plan, static_cast<hipfftDoubleComplex*>(in_data),
+            static_cast<hipfftDoubleComplex*>(out_data),
+            forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
+        return;
+      }
+      case FFTTransformType::R2C: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
+            plan, static_cast<hipfftDoubleReal*>(in_data),
+            static_cast<hipfftDoubleComplex*>(out_data)));
+        return;
+      }
+      case FFTTransformType::C2R: {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
+            plan, static_cast<hipfftDoubleComplex*>(in_data),
+            static_cast<hipfftDoubleReal*>(out_data)));
+        return;
+      }
+    }
+  }
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "hipFFT only support transforms of type float32 and float64"));
+}
+
+template <typename DeviceContext, typename Ti, typename To>
+void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
+                      framework::Tensor* input, framework::Tensor* output,
+                      bool forward) {
+  auto fft_type = config.transform_type();
+  if (fft_type == FFTTransformType::C2R && forward) {
+    forward = false;
+    framework::Tensor input_conj(input->type());
+    input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
+    platform::ForRange<DeviceContext> for_range(ctx, input->numel());
+    phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
+                                        input_conj.data<Ti>());
+    for_range(functor);
+    exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
+  } else if (fft_type == FFTTransformType::R2C && !forward) {
+    forward = true;
+    framework::Tensor out_conj(output->type());
+    out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
+    exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
+
+    platform::ForRange<DeviceContext> for_range(ctx, output->numel());
+    phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
+                                        output->data<To>());
+    for_range(functor);
+  } else {
+    exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
+  }
+}
+
+#endif
+
+// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
+// onesided c2r)
+template <typename DeviceContext, typename Ti, typename To>
+void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
+              const std::vector<int64_t>& dim, bool forward) {
+  const auto x_dims = phi::vectorize(X->dims());
+  const int64_t ndim = static_cast<int64_t>(X->dims().size());
+  auto tensor_place = ctx.GetPlace();
+
+  // make a dim permutation
+  std::vector<int> dim_permute(ndim);
+  std::iota(dim_permute.begin(), dim_permute.end(), int{0});
+  std::vector<bool> is_transformed_dim(ndim);
+  for (const auto& d : dim) {
+    is_transformed_dim[d] = true;
+  }
+  auto batch_end =
+      std::partition(dim_permute.begin(), dim_permute.end(),
+                     [&](int64_t d) { return !is_transformed_dim[d]; });
+  std::sort(dim_permute.begin(), batch_end);
+  std::copy(dim.cbegin(), dim.cend(), batch_end);
+
+  // transpose input according to dim permutation
+  auto transposed_input_shape = X->dims().transpose(dim_permute);
+  framework::Tensor transposed_input;
+  transposed_input.Resize(transposed_input_shape);
+  transposed_input.mutable_data<Ti>(tensor_place);
+  TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
+                                  dim_permute);
+
+  // Reshape batch dimensions into a single dimension
+  const int64_t signal_ndim = static_cast<int64_t>(dim.size());
+  std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
+
+  auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
+  const int64_t batch_dims = ndim - signal_ndim;
+  auto batch_size =
+      std::accumulate(transposed_input_shape_.begin(),
+                      transposed_input_shape_.begin() + batch_dims,
+                      static_cast<int>(1), std::multiplies<int>());
+  collapsed_input_shape[0] = batch_size;
+
+  std::copy(transposed_input_shape_.begin() + batch_dims,
+            transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
+
+  framework::Tensor& collapsed_input = transposed_input;
+  collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
+
+  // make a collpased output
+  const auto out_dims = phi::vectorize(out->dims());
+  std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
+  collapsed_output_shape[0] = batch_size;
+  for (size_t i = 0; i < dim.size(); ++i) {
+    collapsed_output_shape[i + 1] = out_dims[dim[i]];
+  }
+  framework::Tensor collapsed_output;
+  collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
+  collapsed_output.mutable_data<To>(tensor_place);
+
+  FFTConfig* config = nullptr;
+
+#if defined(PADDLE_WITH_CUDA)
+  std::unique_ptr<FFTConfig> config_ = nullptr;
+  // create plan
+  FFTConfigKey key =
+      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
+  bool using_cache = false;
+#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
+  using_cache = true;
+#endif
+
+  if (using_cache) {
+    const int64_t device_id = static_cast<int64_t>(
+        reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
+            ->GetDeviceId());
+    FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
+    std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
+    guard.lock();
+    config = &(plan_cache.lookup(key));
+  } else {
+    config_ = std::make_unique<FFTConfig>(key);
+    config = config_.get();
+  }
+
+  // prepare cufft for execution
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
+  framework::Tensor workspace_tensor;
+  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
+      config->plan(), workspace_tensor.data<To>()));
+  // execute transform plan
+  exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
+                                         &collapsed_output, forward);
+
+#elif defined(PADDLE_WITH_HIP)
+  // create plan
+  FFTConfigKey key =
+      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
+  const int64_t device_id = static_cast<int64_t>(
+      reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
+          ->GetDeviceId());
+  FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
+  std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
+  guard.lock();
+  config = &(plan_cache.lookup(key));
+
+  // prepare cufft for execution
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
+  framework::Tensor workspace_tensor;
+  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
+      config->plan(), workspace_tensor.data<To>()));
+  // execute transform plan
+  exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
+                                          &collapsed_output, forward);
+#endif
+
+  // Inverting output by reshape and transpose to original batch and dimension
+  auto transposed_out_shape = out->dims().transpose(dim_permute);
+
+  collapsed_output.Resize(transposed_out_shape);
+  auto& transposed_output = collapsed_output;
+
+  std::vector<int> reverse_dim_permute(ndim);
+  for (size_t i = 0; i < ndim; i++) {
+    reverse_dim_permute[dim_permute[i]] = i;
+  }
+
+  TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
+                                  reverse_dim_permute);
+}
+
+// Use the optimized path to perform single R2C or C2R if transformation dim is
+// supported by cuFFT
+static bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
+  // For performance reason, when axes starts with (0, 1), do not use the
+  // optimized path.
+  if (axes.size() > kMaxFFTNdim ||
+      (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+template <typename Ti, typename To>
+struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
+  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    if (axes.empty()) {
+      framework::TensorCopy(*X, ctx.GetPlace(), out);
+      return;
+    }
+
+    framework::Tensor* p_out = out;
+    std::vector<int64_t> out_dims = phi::vectorize(X->dims());
+    std::vector<int64_t> working_axes(axes.begin(), axes.end());
+    std::vector<int64_t> first_dims;
+    size_t max_dims;
+    framework::Tensor working_tensor;
+    working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
+    framework::Tensor* p_working_tensor = &working_tensor;
+    framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
+
+    while (true) {
+      max_dims =
+          std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
+      first_dims.assign(working_axes.end() - max_dims, working_axes.end());
+
+      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
+                                                    p_out, first_dims, forward);
+      working_axes.resize(working_axes.size() - max_dims);
+      first_dims.clear();
+
+      if (working_axes.empty()) {
+        break;
+      }
+
+      std::swap(p_out, p_working_tensor);
+    }
+    exec_normalization<platform::CUDADeviceContext, To>(
+        ctx, p_out, out, normalization, out_dims, axes);
+  }
+};
+
+template <typename Ti, typename To>
+struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
+  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    std::vector<int64_t> in_dims = phi::vectorize(X->dims());
+    std::vector<int64_t> out_dims = phi::vectorize(out->dims());
+
+    if (use_optimized_fft_path(axes)) {
+      framework::Tensor x_copy(X->type());
+      x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
+      framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
+      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
+                                                    forward);
+    } else {
+      framework::Tensor temp_tensor;
+      temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
+      const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
+
+      FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
+      c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
+
+      exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
+                                                    {axes.back()}, forward);
+    }
+    exec_normalization<platform::CUDADeviceContext, To>(
+        ctx, out, out, normalization, out_dims, axes);
+  }
+};
+
+// n dimension real to complex FFT use cufft lib
+template <typename Ti, typename To>
+struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
+  void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
+                  Tensor* out, const std::vector<int64_t>& axes,
+                  FFTNormMode normalization, bool forward) {
+    // Step1: R2C transform on the last dimension
+    framework::Tensor* r2c_out = out;
+    const std::vector<int64_t> last_dim{axes.back()};
+    std::vector<int64_t> out_dims = phi::vectorize(out->dims());
+    exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
+                                                  forward);
+
+    // Step2: C2C transform on the remaining dimension
+    framework::Tensor c2c_out;
+    if (axes.size() > 1) {
+      c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
+      std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
+      FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
+      fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
+                   forward);
+    }
+
+    const auto in_sizes = phi::vectorize(X->dims());
+    framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
+    exec_normalization<platform::CUDADeviceContext, To>(
+        ctx, norm_tensor, out, normalization, in_sizes, axes);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h
index a60ec5a4df52b..71b54caf5ee79 100644
--- a/paddle/fluid/operators/spectral_op.h
+++ b/paddle/fluid/operators/spectral_op.h
@@ -11,8 +11,11 @@
 
 #pragma once
 #define NOMINMAX  // to use std::min std::max correctly on windows
+#include <algorithm>
+#include <functional>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/convert_utils.h"
@@ -23,8 +26,10 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/conj_op.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
 #include "paddle/phi/kernels/funcs/padding.h"
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "thrust/device_vector.h"
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index b3403a960a128..ff378396b188f 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -19,7 +19,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -113,13 +115,13 @@ class SqueezeOp : public framework::OperatorWithKernel {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -140,13 +142,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -201,53 +203,18 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 class Squeeze2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze2");
-
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Check input tensor dims (<6) Eigen limit.
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      platform::errors::InvalidArgument(
-                          "The dimensions of Input(X) "
-                          "should be in the range of [1, 6] (Eigen limit)."
-                          "But received X's dimensions = %d, X's shape = [%s].",
-                          x_dims.size(), x_dims));
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-
-    auto out_dims = GetOutputShape(axes, x_dims, false);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-
-    if (!ctx->HasOutput("XShape")) return;
-
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -287,13 +254,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -365,6 +332,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, SqueezeInferShapeFunctor,
+                            PD_INFER_META(phi::SqueezeInferMeta));
+
 REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                   ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
@@ -376,7 +347,7 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
 REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
                   ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::SqueezeInplaceInferer);
+                  ops::SqueezeInplaceInferer, SqueezeInferShapeFunctor);
 REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
                   ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -411,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
                            paddle::platform::complex<double>>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext,
                            paddle::platform::bfloat16>);
-
-REGISTER_OP_CPU_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<double>>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::bfloat16>);
-
-REGISTER_OP_CPU_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex<float>>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex<double>>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc
index 8d7c0e5b4ff0e..19aa12cb55e2f 100644
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ b/paddle/fluid/operators/squeeze_op.cu.cc
@@ -46,33 +46,3 @@ REGISTER_OP_CUDA_KERNEL(
                            paddle::platform::complex<float>>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext,
                            paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            plat::bfloat16>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::complex<float>>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc
new file mode 100644
index 0000000000000..ecbd9edd87dc6
--- /dev/null
+++ b/paddle/fluid/operators/stft_op.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/stft_op.h"
+#include "paddle/fluid/operators/spectral_helper.h"
+
+namespace paddle {
+namespace operators {
+class StftOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "frame");
+
+    const int n_fft = ctx->Attrs().Get<int>("n_fft");
+    const int hop_length = ctx->Attrs().Get<int>("hop_length");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    const int x_rank = x_dims.size();
+    const bool onesided = ctx->Attrs().Get<bool>("onesided");
+
+    PADDLE_ENFORCE_EQ(
+        x_rank, 2,
+        platform::errors::InvalidArgument(
+            "Input(X) of StftOp should be a tensor with shape [N, T], "
+            "but got rank %s.",
+            x_rank));
+    PADDLE_ENFORCE_GT(
+        hop_length, 0,
+        platform::errors::InvalidArgument(
+            "Attribute(hop_length) should be greater than 0, but got %s.",
+            hop_length));
+
+    int seq_length = x_dims[x_rank - 1];
+    int n_frames = 1 + (seq_length - n_fft) / hop_length;
+
+    PADDLE_ENFORCE_LE(n_fft, seq_length,
+                      platform::errors::InvalidArgument(
+                          "Attribute(frame_length) should be less equal than "
+                          "sequence length, but got (%s) > (%s).",
+                          n_fft, seq_length));
+
+    std::vector<int64_t> output_shape;
+    output_shape.push_back(x_dims[0]);
+    if (onesided) {
+      output_shape.push_back(n_fft / 2 + 1);
+    } else {
+      output_shape.push_back(n_fft);
+    }
+    output_shape.push_back(n_frames);
+
+    ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(in_dtype, ctx.GetPlace());
+  }
+};
+
+class StftOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input waveforms with shape (N, T)");
+    AddOutput("Out",
+              "The complex STFT output tensor with shape (N, n_fft, "
+              "num_frames) or (N, n_fft/2 + 1, num_frames)");
+    AddAttr<int>("n_fft", "The number of input samples to perform FFT");
+    AddAttr<int>("hop_length", "Number of samples between adjacent frames");
+    AddAttr<bool>("normalized",
+                  "Control whether to scale the output by 1/sqrt(n_fft)");
+    AddAttr<bool>("onesided",
+                  "Control whether to return half of the FFT output");
+    AddComment(R"DOC(
+      Short-time Fourier transform (STFT).
+    )DOC");
+  }
+};
+
+template <typename T>
+class StftGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("stft_grad");
+    grad_op->SetInput("X", this->Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+class StftGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    const auto out_grad_name = framework::GradVarName("Out");
+    OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
+                   "stft_grad");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft_grad");
+
+    const auto x_grad_name = framework::GradVarName("X");
+    OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
+                   "stft_grad");
+
+    ctx->ShareDim("X", /*->*/ x_grad_name);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto in_dtype = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+    const auto kernel_dtype = framework::ToRealType(in_dtype);
+    return framework::OpKernelType(kernel_dtype, ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(stft, ops::StftOp, ops::StftOpMaker,
+                  ops::StftGradOpMaker<paddle::framework::OpDesc>,
+                  ops::StftGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(stft_grad, ops::StftGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    stft, ops::StftKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::StftKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    stft_grad, ops::StftGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::StftGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/stft_op.cu
similarity index 53%
rename from paddle/fluid/operators/reduce_ops/reduce_min_op.cu
rename to paddle/fluid/operators/stft_op.cu
index 44548b8d2e778..5272be29c0c14 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ b/paddle/fluid/operators/stft_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,13 +11,16 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-// reduce_min
+#include "paddle/fluid/operators/spectral_op.cu.h"
+#include "paddle/fluid/operators/stft_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    stft, ops::StftKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::StftKernel<paddle::platform::CUDADeviceContext, double>);
+
 REGISTER_OP_CUDA_KERNEL(
-    reduce_min,
-    ops::ReduceCudaKernel<float, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<double, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int, kps::MinFunctor, kps::IdentityFunctor>,
-    ops::ReduceCudaKernel<int64_t, kps::MinFunctor, kps::IdentityFunctor>);
+    stft_grad, ops::StftGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::StftGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h
new file mode 100644
index 0000000000000..4f0746ee143f9
--- /dev/null
+++ b/paddle/fluid/operators/stft_op.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#include "paddle/fluid/operators/frame_op.h"
+#include "paddle/fluid/operators/spectral_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class StftKernel : public framework::OpKernel<T> {
+ public:
+  /*
+    Batch Signals (N, T) -> Frames (N, n_fft, num_frames) -> FFTR2C -> (N,
+    n_fft/2 + 1, num_frames) or (N, n_fft, num_frames)
+  */
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using C = paddle::platform::complex<T>;
+    const Tensor* x = ctx.Input<Tensor>("X");
+    Tensor* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<C>(ctx.GetPlace());
+
+    const size_t x_rank = x->dims().size();
+    const size_t out_rank = out->dims().size();
+
+    const int n_fft = ctx.Attr<int>("n_fft");
+    const int hop_length = ctx.Attr<int>("hop_length");
+    const bool normalized = ctx.Attr<bool>("normalized");
+    const bool onesided = ctx.Attr<bool>("onesided");
+
+    const int n_frames = out->dims()[out_rank - 1];
+    const int seq_length = x->dims()[x_rank - 1];
+
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+
+    std::vector<int64_t> axes = {1};
+
+    // Frame
+    Tensor frames;
+    framework::DDim frames_dims(out->dims());
+    frames_dims.at(axes.back()) = n_fft;
+    frames.mutable_data<T>(frames_dims, ctx.GetPlace());
+    FrameFunctor<DeviceContext, T>()(dev_ctx, x, &frames, seq_length, n_fft,
+                                     n_frames, hop_length, /*is_grad*/ false);
+
+    // FFTR2C
+    FFTNormMode normalization;
+    if (normalized) {
+      normalization = get_norm_from_string("ortho", true);
+    } else {
+      normalization = get_norm_from_string("backward", true);
+    }
+    FFTR2CFunctor<DeviceContext, T, C> fft_r2c_func;
+
+    if (onesided) {
+      fft_r2c_func(dev_ctx, &frames, out, axes, normalization, true);
+    } else {
+      framework::DDim onesided_dims(out->dims());
+      const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1;
+      onesided_dims.at(axes.back()) = onesided_axis_size;
+      Tensor onesided_out;
+      onesided_out.mutable_data<C>(onesided_dims, ctx.GetPlace());
+      fft_r2c_func(dev_ctx, &frames, &onesided_out, axes, normalization, true);
+      fill_conj<DeviceContext, C>(dev_ctx, &onesided_out, out, axes);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class StftGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using C = paddle::platform::complex<T>;
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+
+    const auto* dy = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
+
+    const size_t dy_rank = dy->dims().size();
+    const size_t dx_rank = dx->dims().size();
+
+    const int n_fft = ctx.Attr<int>("n_fft");
+    const int hop_length = ctx.Attr<int>("hop_length");
+    const bool normalized = ctx.Attr<bool>("normalized");
+    const bool onesided = ctx.Attr<bool>("onesided");
+    const int n_frames = dy->dims()[dy_rank - 1];
+    const int seq_length = dx->dims()[dx_rank - 1];
+
+    std::vector<int64_t> axes = {1};
+    Tensor d_frames;
+    framework::DDim d_frames_dims(dy->dims());
+    d_frames_dims.at(axes.back()) = n_fft;
+    d_frames.mutable_data<T>(d_frames_dims, ctx.GetPlace());
+
+    Tensor complex_d_frames;
+    complex_d_frames.mutable_data<C>(d_frames_dims, ctx.GetPlace());
+
+    // dy -> d_frames
+    FFTNormMode normalization;
+    if (normalized) {
+      normalization = get_norm_from_string("ortho", true);
+    } else {
+      normalization = get_norm_from_string("backward", true);
+    }
+    FFTC2CFunctor<DeviceContext, C, C> fft_c2c_func;
+
+    if (!onesided) {
+      fft_c2c_func(dev_ctx, dy, &complex_d_frames, axes, normalization, false);
+    } else {
+      Tensor full_dy;
+      full_dy.mutable_data<C>(d_frames_dims, ctx.GetPlace());
+      auto zero_length = static_cast<int>(full_dy.dims().at(axes.back()) -
+                                          dy->dims().at(axes.back()));
+      auto rank = dy->dims().size();
+
+      std::vector<int> pads(rank * 2, 0);
+      pads[axes.back() * 2 + 1] = zero_length;
+
+      phi::funcs::PaddingFunctor<DeviceContext, C>(
+          rank, ctx.template device_context<DeviceContext>(), pads,
+          static_cast<C>(0), *dy, &full_dy);
+      fft_c2c_func(dev_ctx, &full_dy, &complex_d_frames, axes, normalization,
+                   false);
+    }
+    framework::TransComplexToReal(
+        framework::TransToProtoVarType(d_frames.dtype()),
+        framework::TransToProtoVarType(complex_d_frames.dtype()),
+        complex_d_frames, &d_frames);
+
+    // d_frames -> dx
+    FrameFunctor<DeviceContext, T>()(dev_ctx, &d_frames, dx, seq_length, n_fft,
+                                     n_frames, hop_length, /*is_grad*/ true);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc
index d198992abde7d..0c178b02d0309 100644
--- a/paddle/fluid/operators/sync_batch_norm_op.cc
+++ b/paddle/fluid/operators/sync_batch_norm_op.cc
@@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
                   ops::BatchNormOpInferVarType,
                   ops::SyncBatchNormGradMaker<paddle::framework::OpDesc>,
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index d1add111e1d24..0a9ae789b01ee 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include <memory>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2");
-    OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2");
-
-    auto input_dims = ctx->GetInputDim("X");
-    const int& dim_size = input_dims.size();
-    int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
-    PADDLE_ENFORCE_EQ(
-        (axis < dim_size) && (axis >= (-1 * dim_size)), true,
-        paddle::platform::errors::InvalidArgument(
-            "the axis of topk must be [-%d, %d), but you set axis is %d",
-            dim_size, dim_size, axis));
-
-    if (axis < 0) axis += dim_size;
-
-    int k;
-    auto k_is_tensor = ctx->HasInput("K");
-    if (k_is_tensor) {
-      k = -1;
-    } else {
-      k = static_cast<int>(ctx->Attrs().Get<int>("k"));
-      PADDLE_ENFORCE_EQ(k >= 1, true,
-                        paddle::platform::errors::InvalidArgument(
-                            "the attribute of k in the topk must >= 1 or be a "
-                            "Tensor, but received %d .",
-                            k));
-    }
-
-    PADDLE_ENFORCE_GE(input_dims.size(), 1,
-                      paddle::platform::errors::InvalidArgument(
-                          "input of topk must have >= 1d shape"));
-
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(
-          input_dims[axis], k,
-          paddle::platform::errors::InvalidArgument(
-              "input of topk op must have >= %d columns in axis of %d", k,
-              axis));
-    }
-
-    framework::DDim dims = input_dims;
-
-    dims[axis] = k;
-    ctx->SetOutputDim("Out", dims);
-    ctx->SetOutputDim("Indices", dims);
-    ctx->ShareLoD("X", "Out");
-    ctx->ShareLoD("X", "Indices");
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor,
+                            PD_INFER_META(phi::TopKInferMeta));
 REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker,
                   ops::TopkV2GradOpMaker<paddle::framework::OpDesc>,
-                  ops::TopkV2GradOpMaker<paddle::imperative::OpBase>);
+                  ops::TopkV2GradOpMaker<paddle::imperative::OpBase>,
+                  TopKInferShapeFunctor);
 
 REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad);
diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc
index 3e943c62e1ce1..c8010e8a128e0 100644
--- a/paddle/fluid/operators/tril_triu_op.cc
+++ b/paddle/fluid/operators/tril_triu_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
                   ops::TrilTriuGradOpMaker<paddle::framework::OpDesc>,
                   ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
-REGISTER_OP_CPU_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext,
-                              plat::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu
deleted file mode 100644
index 9cbbdeeb2ce28..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.cu
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tril_triu_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    tril_triu_grad,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h
deleted file mode 100644
index 3150b7617d10a..0000000000000
--- a/paddle/fluid/operators/tril_triu_op.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/for_range.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class TrilTriuCompute {
- public:
-  HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower,
-                             const int64_t H, const int64_t W, T* out)
-      : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
-
-  HOSTDEVICE void operator()(int64_t idx) {
-    const int64_t row = (idx / W_) % H_;
-    const int64_t col = idx % W_;
-    const bool mask =
-        lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
-    out_[idx] = mask ? static_cast<T>(0) : in_[idx];
-  }
-
- private:
-  const T* in_;
-  const int diagonal_;
-  const bool lower_;
-  const int64_t H_;
-  const int64_t W_;
-  T* out_;
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* x = context.Input<framework::Tensor>("X");
-    const auto* x_data = x->data<T>();
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_data = out->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = x->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(x->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_computer(
-        x_data, diagonal, lower, H, W, out_data);
-    for_range(tril_triu_computer);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TrilTriuGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    const auto* dout_data = d_out->data<T>();
-    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dx_data = d_x->mutable_data<T>(context.GetPlace());
-
-    const int diagonal = context.Attr<int>("diagonal");
-    const bool lower = context.Attr<bool>("lower");
-
-    const auto& dims = d_out->dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-
-    platform::ForRange<DeviceContext> for_range(
-        context.template device_context<DeviceContext>(),
-        static_cast<size_t>(d_out->numel()));
-
-    paddle::operators::TrilTriuCompute<T> tril_triu_grad_computer(
-        dout_data, diagonal, lower, H, W, dx_data);
-    for_range(tril_triu_grad_computer);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc
index ad1c1814c05cd..4145730357d60 100644
--- a/paddle/fluid/operators/tril_triu_op_npu.cc
+++ b/paddle/fluid/operators/tril_triu_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc
index e36cbcf228cfb..a44ea8ff689b8 100644
--- a/paddle/fluid/operators/tril_triu_op_xpu.cc
+++ b/paddle/fluid/operators/tril_triu_op_xpu.cc
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 6389c5b268013..445e8cd468bf3 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -18,7 +18,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -251,19 +253,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
 class Unsqueeze2Op : public UnsqueezeOp {
  public:
   using UnsqueezeOp::UnsqueezeOp;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    UnsqueezeOp::InferShape(ctx);
-    const auto &x_dims = ctx->GetInputDim("X");
-
-    if (!ctx->HasOutput("XShape")) return;
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
 };
 
 class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
@@ -339,10 +328,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X");
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor,
+                            PD_INFER_META(phi::UnsqueezeInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                   ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>);
+
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                   ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -351,7 +344,8 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
 REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
                   ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::UnsqueezeInplaceInferer);
+                  Unsqueeze2InferShapeFunctor, ops::UnsqueezeInplaceInferer);
+
 REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
                   ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -388,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
                              paddle::platform::complex<double>>,
     ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext,
                              paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc
index 2dcc4d2152a5c..f20ddb5c881e4 100644
--- a/paddle/fluid/operators/unsqueeze_op.cu.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cu.cc
@@ -50,37 +50,3 @@ REGISTER_OP_CUDA_KERNEL(
                              paddle::platform::complex<float>>,
     ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              plat::bfloat16>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<double>>);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 04c8a329e5e1a..de09860fd26d5 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -117,7 +117,7 @@ endif()
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 
 # seperate init from device_context to avoid cycle dependencies
-cc_library(init SRCS init.cc DEPS device_context custom_kernel)
+cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool)
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt
index acf914c5087d0..42c949f7fe0f6 100644
--- a/paddle/fluid/platform/device/ipu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt
@@ -13,7 +13,7 @@ IF(WITH_IPU)
     "ipu_device.cc"
   )
 
-  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper)
+  cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper popdist)
   cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce)
   add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC})
   add_dependencies(paddle_ipu ipu_backend)
diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc
index e0b3b08a2313d..012294d0fff85 100644
--- a/paddle/fluid/platform/device/ipu/ipu_backend.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc
@@ -32,6 +32,7 @@ IpuBackend* IpuBackend::GetInstance() {
 IpuBackend::IpuBackend() {
   compiler_ = std::make_unique<Compiler>();
   executor_ = std::make_unique<Executor>();
+  timer_ = std::make_unique<platform::Timer>();
 }
 
 IpuBackend::~IpuBackend() {
@@ -43,6 +44,7 @@ void IpuBackend::Compile(Graph* graph,
                          const std::vector<std::string>& feed_list,
                          const std::vector<std::string>& fetch_list) {
   VLOG(10) << "enter IpuBackend::Compile";
+  is_compiled_ = false;
   compiler_->Prepare(graph);
   compiler_->InitInputs(feed_list);
   compiler_->LowerConstants(scope_);
@@ -52,31 +54,25 @@ void IpuBackend::Compile(Graph* graph,
   if (ipu_strategy_->is_training) {
     compiler_->LowerOptimizer(scope_);
   }
+  if (!ipu_strategy_->onnx_dump_path.empty()) {
+    SaveModelProto(ipu_strategy_->onnx_dump_path);
+  }
   executor_->SetCompilerResources(compiler_->GetResources());
-
+  executor_->Prepare(compiler_->GetModelProto());
   is_compiled_ = true;
-  // when call compile, means a new graph
-  is_prepared_ = false;
   VLOG(10) << "leave IpuBackend::Compile";
 }
 
 void IpuBackend::Run(const std::vector<const Tensor*>& inputs,
                      const std::vector<Tensor*>& outputs,
                      const framework::ExecutionContext& ctx) {
-  Prepare();
   timer_->Start();
   executor_->Run(inputs, outputs, ctx);
   timer_->Pause();
   VLOG(10) << "[IPU Run]: " << timer_->ElapsedMS() << " (ms)";
 }
 
-void IpuBackend::Prepare() {
-  if (!is_prepared_) {
-    executor_->Prepare(compiler_->GetModelProto());
-    timer_.reset(new platform::Timer());
-    is_prepared_ = true;
-  }
-}
+void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); }
 
 void IpuBackend::Detach() { executor_->Detach(); }
 
@@ -101,12 +97,10 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) {
 }
 
 void IpuBackend::SaveModelProto(const std::string& path) {
-  if (ipu_strategy_->is_training && is_prepared_) {
+  if (ipu_strategy_->is_training && is_compiled_) {
     executor_->SaveModelToHost(path);
-  } else if (is_compiled_) {
-    compiler_->SaveModelProtoNoCheck(path);
   } else {
-    LOG(WARNING) << "Model is empty";
+    compiler_->SaveModelProtoNoCheck(path);
   }
 }
 
diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h
index 1244192490c16..0578d9face675 100644
--- a/paddle/fluid/platform/device/ipu/ipu_backend.h
+++ b/paddle/fluid/platform/device/ipu/ipu_backend.h
@@ -60,6 +60,9 @@ class IpuBackend {
            const std::vector<Tensor *> &outputs,
            const framework::ExecutionContext &ctx);
 
+  // Sync weights from IPU while training
+  void WeightsToHost();
+
   // detach IPU manually
   void Detach();
 
@@ -76,22 +79,17 @@ class IpuBackend {
   void SaveModelProto(const std::string &path);
 
  private:
-  void Prepare();
-
- private:
-  std::unique_ptr<Compiler> compiler_;
-  std::unique_ptr<Executor> executor_;
-  bool is_compiled_ = false;
-  bool is_prepared_ = false;
-
   // not own
   const Scope *scope_ = nullptr;
   const IpuStrategy *ipu_strategy_ = nullptr;
 
- private:
-  // time record for IpuBackend::Run
+  // own
+  std::unique_ptr<Compiler> compiler_;
+  std::unique_ptr<Executor> executor_;
   std::unique_ptr<platform::Timer> timer_;
 
+  bool is_compiled_ = false;
+
   DISABLE_COPY_AND_ASSIGN(IpuBackend);
 };
 
diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
index cdb3f6f9b3e28..1a3e600058b3b 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
@@ -18,6 +18,7 @@
 #include <popart/adaptive.hpp>
 #include <popart/optimizer.hpp>
 #include <popart/sgd.hpp>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/device/ipu/ipu_utils.h"
 
@@ -25,13 +26,20 @@ namespace paddle {
 namespace platform {
 namespace ipu {
 
-popart::AdamMode AdamModeFromStr(const std::string& str) {
+popart::AdamMode AdamModeFromStr(const std::string& str,
+                                 const bool& use_no_bias_optimizer) {
   if (str == "adam") {
-    return popart::AdamMode::Adam;
+    if (!use_no_bias_optimizer)
+      return popart::AdamMode::Adam;
+    else
+      return popart::AdamMode::AdamNoBias;
   } else if (str == "adamax") {
     return popart::AdamMode::AdaMax;
   } else if (str == "lamb") {
-    return popart::AdamMode::Lamb;
+    if (!use_no_bias_optimizer)
+      return popart::AdamMode::Lamb;
+    else
+      return popart::AdamMode::LambNoBias;
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Uknown AdamMode: %s, AdamMode must be one of these values: adam, "
@@ -70,6 +78,17 @@ popart::WeightDecayMode WeightDecayModeFromStr(const std::string& str) {
   }
 }
 
+popart::DataType DataTypeFromStr(const std::string& str) {
+  if (str == "FLOAT") {
+    return popart::DataType::FLOAT;
+  } else if (str == "FLOAT16") {
+    return popart::DataType::FLOAT16;
+  } else {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Unsupported DataType: %s", str));
+  }
+}
+
 template <typename T>
 T GetAttrAllowNull(std::string attr, OpDesc* op_desc) {
   if (op_desc->HasAttr(attr)) {
@@ -122,6 +141,17 @@ void Compiler::Prepare(const Graph* graph) {
   builder_ = popart::Builder::create();
   resources_ = std::make_unique<CompilerResources>();
   graph_helper_ = std::make_unique<GraphHelper>(graph);
+  // Set the flag of set_amp_for_all_
+  for (auto* node : graph_helper_->sorted_ops) {
+    auto* op_desc = node->Op();
+    auto op_type = op_desc->Type();
+    if (op_type == "popart_matmul") {
+      if (op_desc->HasAttr(sAvailMemAttribute)) {
+        set_amp_for_all_ = false;
+        return;
+      }
+    }
+  }
 }
 
 void Compiler::RegisterOpFunc() {
@@ -155,7 +185,9 @@ void Compiler::RegisterOpFunc() {
      auto debug_context = BuildDebugContext(op_desc);         \
      auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1();   \
      auto aiOnnxOpset = builder_->aiOnnxOpset11();            \
+     PushNameScope(op_desc);                                  \
      auto output_ids = OnnxImpl(inputs Args, debug_context);  \
+     PopNameScope(op_desc);                                   \
      SetIpuIndexStage(output_ids, op_desc);                   \
      SetAMPAttributes(output_ids, op_desc);                   \
      SetSerializeAttributes(output_ids, op_desc);             \
@@ -241,7 +273,9 @@ void Compiler::LowerConstants(const Scope* scope) {
       popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()),
                                      shape);
       const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info));
+      PushNameScope(op_desc);
       popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data);
+      PopNameScope(op_desc);
       SetIpuIndexStage(result, op_desc);
       resources_->tensors.emplace(tensor_name, result);
     }
@@ -261,6 +295,10 @@ void Compiler::LowerWeights(const Scope* scope) {
           VLOG(10) << "found existed one, skip lowering Weight: " << var_name;
           continue;
         }
+        if (var_name.rfind("learning_rate", 0) == 0) {
+          VLOG(10) << "skip learning_rate_var: " << var_name;
+          continue;
+        }
         VLOG(10) << "lowering weight: " << var_name;
 
         auto var = scope->FindVar(var_name);
@@ -273,10 +311,15 @@ void Compiler::LowerWeights(const Scope* scope) {
           }
           popart::TensorInfo tensor_info(dtype, shape);
           popart::ConstVoidData const_data{tensor.data(), tensor_info};
-          popart::TensorId result =
-              builder_->addInitializedInputTensor(const_data, var_name);
-          resources_->tensors.emplace(var_name, result);
-          resources_->weights.push_back(result);
+          if (!node->outputs.empty()) {
+            auto op_node = node->outputs[0];
+            PushNameScope(op_node->Op());
+            popart::TensorId result =
+                builder_->addInitializedInputTensor(const_data, var_name);
+            PopNameScope(op_node->Op());
+            resources_->tensors.emplace(var_name, result);
+            resources_->weights.push_back(var_name);
+          }
         }
       }
     }
@@ -298,7 +341,10 @@ void Compiler::LowerBody() {
     } else if (op_type == "popart_checkpointoutput") {
       auto inputs = GetOpInputs(op_desc);
       auto outputs = GetOpOutputs(op_desc);
+      PushNameScope(op_desc);
       auto output_ids = builder_->checkpointOutput(inputs);
+      PopNameScope(op_desc);
+      SetIpuIndexStage(output_ids, op_desc);
       InsertTensors(outputs, output_ids);
     } else if (op_type == "popart_custom_op") {
       auto inputs = GetOpInputs(op_desc);
@@ -313,9 +359,11 @@ void Compiler::LowerBody() {
           BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type"));
       VLOG(10) << "Build graph from custom op: " << __op_type;
       auto it = custom_ops_.find(__op_type);
+      PushNameScope(op_desc);
       auto output_ids =
           builder_->customOp(it->second.popart_op, it->second.popart_op.version,
                              inputs, outputs.size(), attributes, debug_context);
+      PopNameScope(op_desc);
       SetIpuIndexStage(output_ids, op_desc);
       InsertTensors(outputs, output_ids);
     } else if (op_type == "popart_printtensor") {
@@ -325,8 +373,10 @@ void Compiler::LowerBody() {
       auto print_gradient =
           BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient"));
       auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title"));
+      PushNameScope(op_desc);
       auto output_ids = builder_->aiGraphcoreOpset1().printtensor(
           inputs, print_gradient, debug_context, title);
+      PopNameScope(op_desc);
       SetIpuIndexStage(output_ids, op_desc);
       InsertTensors(outputs, output_ids);
     } else {
@@ -367,8 +417,31 @@ void Compiler::LowerOptimizer(const Scope* scope) {
         resources_->with_lr_sched = false;
       }
       VLOG(10) << "Set initial lr: " << resources_->lr;
-      auto loss_scaling = ipu_strategy_->loss_scaling;
+
+      // Get the type of optimizer
       auto type = BOOST_GET_CONST(std::string, op_desc->GetAttr("type"));
+      // Set weight decay by tensor names for Lamb
+      auto weight_decay_vars = BOOST_GET_CONST(
+          std::vector<std::string>, op_desc->GetAttr("weight_decay_vars"));
+      auto weight_decay_values = BOOST_GET_CONST(
+          std::vector<float>, op_desc->GetAttr("weight_decay_values"));
+      // Get the maximum permissible value for gradient clipping
+      std::vector<popart::ClipNormSettings> clip_norm_settings = {};
+      if (op_desc->HasAttr("clip_norm")) {
+        auto clip_norm = BOOST_GET_CONST(float, op_desc->GetAttr("clip_norm"));
+        clip_norm_settings.push_back(
+            popart::ClipNormSettings::clipAllWeights(clip_norm));
+        VLOG(10) << "Set the global gradient clipping with the maximum "
+                    "permissible value: "
+                 << clip_norm;
+      }
+
+      // Values from ipu_strategy
+      auto loss_scaling = ipu_strategy_->loss_scaling;
+      auto accl1_type = DataTypeFromStr(ipu_strategy_->accl1_type);
+      auto accl2_type = DataTypeFromStr(ipu_strategy_->accl2_type);
+      auto accl3_type = DataTypeFromStr(ipu_strategy_->accl3_type);
+
       if (type == "sgd") {
         auto weight_decay =
             BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
@@ -376,12 +449,18 @@ void Compiler::LowerOptimizer(const Scope* scope) {
         resources_->optimizer_fn = [=](float lr) {
           return std::make_unique<popart::SGD>(
               popart::OptimizerValue(lr, false),
-              popart::OptimizerValue(weight_decay, true),
+              popart::OptimizerValue(weight_decay, false),
               popart::OptimizerValue(momentum, true),
               popart::SGD::getUnsetDampening(),
               popart::SGD::getUnsetVelocityScaling(),
-              popart::OptimizerValue(loss_scaling, true));
+              popart::OptimizerValue(loss_scaling, true), clip_norm_settings);
         };
+        resources_->eval_optimizer = std::make_unique<popart::SGD>(
+            popart::OptimizerValue(0.0, false),
+            popart::OptimizerValue(0.0, false),
+            popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(),
+            popart::SGD::getUnsetVelocityScaling(),
+            popart::OptimizerValue(loss_scaling, true), clip_norm_settings);
       } else if (type == "adam") {
         auto weight_decay =
             BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay"));
@@ -392,22 +471,79 @@ void Compiler::LowerOptimizer(const Scope* scope) {
         VLOG(10) << "set max_weight_norm: " << mwn;
         auto adam_mode_ =
             BOOST_GET_CONST(std::string, op_desc->GetAttr("adam_mode"));
-        auto adam_mode = AdamModeFromStr(adam_mode_);
-        auto weight_decay_mode_ =
-            BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
+        auto adam_mode =
+            AdamModeFromStr(adam_mode_, ipu_strategy_->use_no_bias_optimizer);
+        auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
+        if (weight_decay_mode_.empty()) {
+          weight_decay_mode_ = BOOST_GET_CONST(
+              std::string, op_desc->GetAttr("weight_decay_mode"));
+        }
         auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
         resources_->optimizer_fn = [=](float lr) {
-          return std::make_unique<popart::Adam>(
-              popart::OptimizerValue(lr, false),
-              popart::OptimizerValue(weight_decay, true),
-              popart::OptimizerValue(beta1, true),
-              popart::OptimizerValue(beta2, true),
+          if (adam_mode == popart::AdamMode::Lamb ||
+              adam_mode == popart::AdamMode::LambNoBias) {
+            const std::map<std::string, std::pair<float, bool>>
+                optimizer_value = {{"defaultLearningRate", {lr, false}},
+                                   {"defaultBeta1", {beta1, false}},
+                                   {"defaultBeta2", {beta2, false}},
+                                   {"defaultEps", {eps, true}},
+                                   {"lossScaling", {loss_scaling, true}},
+                                   {"defaultMaxWeightNorm", {mwn, true}}};
+            auto optimizer_instance = std::make_unique<popart::Adam>(
+                optimizer_value, adam_mode, weight_decay_mode,
+                popart::DataType::UNDEFINED, accl1_type, accl2_type,
+                clip_norm_settings);
+            for (int i = 0; i < weight_decay_vars.size(); i++) {
+              optimizer_instance->insertSpecific(
+                  weight_decay_vars[i],
+                  {{"weightDecay", {weight_decay_values[i], false}}});
+              VLOG(10) << "Set Tensor " << weight_decay_vars[i]
+                       << " weight decay as " << weight_decay_values[i];
+            }
+            return optimizer_instance;
+          } else {
+            return std::make_unique<popart::Adam>(
+                popart::OptimizerValue(lr, false),
+                popart::OptimizerValue(weight_decay, false),
+                popart::OptimizerValue(beta1, false),
+                popart::OptimizerValue(beta2, false),
+                popart::OptimizerValue(eps, true),
+                popart::OptimizerValue(loss_scaling, true),
+                popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
+                popart::DataType::UNDEFINED, accl1_type, accl2_type,
+                clip_norm_settings);
+          }
+        };
+        if (adam_mode == popart::AdamMode::Lamb ||
+            adam_mode == popart::AdamMode::LambNoBias) {
+          const std::map<std::string, std::pair<float, bool>> optimizer_value =
+              {{"defaultLearningRate", {0.0, false}},
+               {"defaultBeta1", {beta1, false}},
+               {"defaultBeta2", {beta2, false}},
+               {"defaultEps", {eps, true}},
+               {"lossScaling", {loss_scaling, true}},
+               {"defaultMaxWeightNorm", {mwn, true}}};
+          auto eval_optimizer = std::make_unique<popart::Adam>(
+              optimizer_value, adam_mode, weight_decay_mode,
+              popart::DataType::UNDEFINED, popart::DataType::FLOAT,
+              popart::DataType::FLOAT, clip_norm_settings);
+          for (int i = 0; i < weight_decay_vars.size(); i++) {
+            eval_optimizer->insertSpecific(weight_decay_vars[i],
+                                           {{"weightDecay", {0.0, false}}});
+          }
+          resources_->eval_optimizer = std::move(eval_optimizer);
+        } else {
+          resources_->eval_optimizer = std::make_unique<popart::Adam>(
+              popart::OptimizerValue(0.0, false),
+              popart::OptimizerValue(0.0, false),
+              popart::OptimizerValue(beta1, false),
+              popart::OptimizerValue(beta2, false),
               popart::OptimizerValue(eps, true),
               popart::OptimizerValue(loss_scaling, true),
               popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode,
               popart::DataType::UNDEFINED, popart::DataType::FLOAT,
-              popart::DataType::FLOAT);
-        };
+              popart::DataType::FLOAT, clip_norm_settings);
+        }
       } else if (type == "adaptive") {
         auto alpha = BOOST_GET_CONST(float, op_desc->GetAttr("alpha"));
         auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
@@ -417,21 +553,33 @@ void Compiler::LowerOptimizer(const Scope* scope) {
         auto adaptive_mode_ =
             BOOST_GET_CONST(std::string, op_desc->GetAttr("adaptive_mode"));
         auto adaptive_mode = AdaptiveModeFromStr(adaptive_mode_);
-        auto weight_decay_mode_ =
-            BOOST_GET_CONST(std::string, op_desc->GetAttr("weight_decay_mode"));
+        auto weight_decay_mode_ = ipu_strategy_->weight_decay_mode;
+        if (weight_decay_mode_.empty()) {
+          weight_decay_mode_ = BOOST_GET_CONST(
+              std::string, op_desc->GetAttr("weight_decay_mode"));
+        }
         auto weight_decay_mode = WeightDecayModeFromStr(weight_decay_mode_);
         resources_->optimizer_fn = [=](float lr) {
           return std::make_unique<popart::Adaptive>(
               popart::OptimizerValue(lr, false),
-              popart::OptimizerValue(weight_decay, true),
+              popart::OptimizerValue(weight_decay, false),
               popart::OptimizerValue(alpha, true),
               popart::OptimizerValue(momentum, true),
               popart::OptimizerValue(eps, true),
               popart::OptimizerValue(loss_scaling, true), adaptive_mode,
-              weight_decay_mode, popart::DataType::UNDEFINED,
-              popart::DataType::FLOAT, popart::DataType::FLOAT,
-              popart::DataType::FLOAT);
+              weight_decay_mode, popart::DataType::UNDEFINED, accl1_type,
+              accl2_type, accl3_type);
         };
+        resources_->eval_optimizer = std::make_unique<popart::Adaptive>(
+            popart::OptimizerValue(0.0, false),
+            popart::OptimizerValue(0.0, false),
+            popart::OptimizerValue(alpha, true),
+            popart::OptimizerValue(momentum, true),
+            popart::OptimizerValue(eps, true),
+            popart::OptimizerValue(loss_scaling, true), adaptive_mode,
+            weight_decay_mode, popart::DataType::UNDEFINED,
+            popart::DataType::FLOAT, popart::DataType::FLOAT,
+            popart::DataType::UNDEFINED);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "optimizer %s is not implemented", type));
@@ -510,9 +658,32 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id,
                                 const OpDesc* op_desc) {
   VLOG(10) << "enter Compiler::SetAMPAttributes";
   if (op_desc->Type() == "popart_matmul") {
-    auto amp = ipu_strategy_->available_memory_proportion;
-    if (amp > 0.0f && amp <= 1.0) {
-      builder_->setAvailableMemoryProportion(tensor_id, amp);
+    if (set_amp_for_all_) {
+      auto amp = ipu_strategy_->available_memory_proportion;
+      if (amp < 0.0f || amp > 1.0) {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "AvailableMemoryProportion %f is invalid, which should be set 0 <= "
+            "amp <= 1",
+            amp));
+      }
+      if (amp > 0.0f) {
+        builder_->setAvailableMemoryProportion(tensor_id, amp);
+      }
+    } else {
+      if (op_desc->HasAttr(sAvailMemAttribute)) {
+        auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute));
+        if (amp < 0.0f || amp > 1.0) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "AvailableMemoryProportion %f is invalid, which should be set 0 "
+              "<= amp <= 1",
+              amp));
+        }
+        if (amp > 0.0f) {
+          builder_->setAvailableMemoryProportion(tensor_id, amp);
+          VLOG(10) << "set available_memory_proportion for tensor: "
+                   << tensor_id << " as " << amp;
+        }
+      }
     }
   }
   VLOG(10) << "leave Compiler::SetAMPAttributes";
@@ -602,6 +773,29 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) {
   return popart::DebugContext(op_identify_id);
 }
 
+void Compiler::PushNameScope(const OpDesc* op) {
+  auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope));
+  if (op_namescope == "/") {
+    return;
+  }
+  if (!op_namescope.empty()) {
+    op_namescope.pop_back();
+  }
+  if (!op_namescope.empty()) {
+    op_namescope.erase(op_namescope.begin());
+  }
+  VLOG(10) << "name_scope is: " << op_namescope;
+  builder_->pushNameScope(op_namescope);
+}
+
+void Compiler::PopNameScope(const OpDesc* op) {
+  auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope));
+  if (op_namescope == "/") {
+    return;
+  }
+  builder_->popNameScope();
+}
+
 }  // namespace ipu
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h
index 5d1e8c2727d8f..2d00970bf1297 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.h
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h
@@ -50,6 +50,8 @@ struct CompilerResources {
   using OptimizerFn =
       std::function<std::unique_ptr<popart::Optimizer>(float lr)>;
   OptimizerFn optimizer_fn;
+  // The eval mode of optimizer in training
+  std::unique_ptr<popart::Optimizer> eval_optimizer;
 
  public:
   popart::Optimizer *Optimizer() { return optimizer.get(); }
@@ -110,6 +112,7 @@ class Compiler {
   void RegisterOpFunc();
   std::vector<std::string> GetOpInputs(const OpDesc *op);
   const std::vector<std::string> &GetOpOutputs(const OpDesc *op);
+  const std::string GetNameScope(const OpDesc *op);
   popart::DebugContext BuildDebugContext(const OpDesc *op);
 
   void InsertTensors(const std::vector<std::string> &output_names,
@@ -126,6 +129,8 @@ class Compiler {
                               const OpDesc *op_desc);
   void SetSerializeAttributes(const std::string &tensor_id,
                               const OpDesc *op_desc);
+  void PushNameScope(const OpDesc *op);
+  void PopNameScope(const OpDesc *op);
 
  private:
   std::unique_ptr<popart::Builder> builder_;
@@ -137,6 +142,14 @@ class Compiler {
 
   const IpuStrategy *ipu_strategy_ = nullptr;
   std::map<std::string, IpuCustomOpIdentifier> custom_ops_;
+
+  // Used to choose the way to set amp for Ops
+  // If anyone op has the attr sAvailMemAttribute, the
+  // available_memory_proportion from ipu_strategy
+  // will be ignored and the Ops are set by their own sAvailMemAttribute. Else,
+  // all relevant Ops will be set by
+  // the available_memory_proportion from ipu_strategy.
+  bool set_amp_for_all_ = true;
 };
 
 }  // namespace ipu
diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc
index c124d58957fe6..649b291244110 100644
--- a/paddle/fluid/platform/device/ipu/ipu_executor.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc
@@ -64,15 +64,10 @@ void Executor::Prepare(const std::string &proto) {
   WeightsFromPaddle();
   VLOG(10) << "Copy weights from paddle to popart...done";
 
-  VLOG(10) << "Copy weights from host to device...";
-  session_->weightsFromHost();
-  VLOG(10) << "Copy weights from host to device...done";
-
-  if (ipu_strategy_->save_init_onnx) {
-    session_->modelToHost("test_init.onnx");
+  if (ipu_strategy_->random_seed != std::numeric_limits<std::uint64_t>::max()) {
+    VLOG(10) << "Setting random seed to: " << ipu_strategy_->random_seed;
+    session_->setRandomSeed(ipu_strategy_->random_seed);
   }
-  // init run step
-  step_ = 0;
 }
 
 void Executor::Run(const std::vector<const Tensor *> &inputs,
@@ -120,11 +115,17 @@ void Executor::Run(const std::vector<const Tensor *> &inputs,
   VLOG(10) << "Prepared inputs/anchors";
 
   if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) {
-    VLOG(10) << "Update learning_rate";
-    auto new_lr =
-        GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var);
-    VLOG(10) << "New Lr: " << new_lr;
-    auto *optimizer = compiler_resources_->UpdateOptimizer(new_lr);
+    popart::Optimizer *optimizer;
+    if (ipu_strategy_->runtime_options.enable_eval) {
+      VLOG(10) << "Switch optimizer to eval mode";
+      optimizer = compiler_resources_->eval_optimizer.get();
+    } else {
+      VLOG(10) << "Update learning_rate";
+      auto new_lr =
+          GetSingleVarFromScope<float>(scope_, compiler_resources_->lr_var);
+      VLOG(10) << "New Lr: " << new_lr;
+      optimizer = compiler_resources_->UpdateOptimizer(new_lr);
+    }
     auto *session = dynamic_cast<popart::TrainingSession *>(session_.get());
     session->updateOptimizerFromHost(optimizer);
   }
@@ -133,15 +134,13 @@ void Executor::Run(const std::vector<const Tensor *> &inputs,
   VLOG(10) << "Running...";
   session_->run(stepio);
   VLOG(10) << "Running...done";
+}
 
-  step_++;
-  if (ipu_strategy_->is_training &&
-      step_ % ipu_strategy_->save_per_n_step == 0) {
-    session_->weightsToHost();
+void Executor::WeightsToHost() {
+  if (ipu_strategy_->is_training && session_) {
     WeightsToPaddle();
-    if (ipu_strategy_->save_onnx_checkpoint) {
-      session_->modelToHost("test_last" + std::to_string(step_) + ".onnx");
-    }
+  } else {
+    LOG(WARNING) << "For a non-trainning graph, cannot sync weights from IPU.";
   }
 }
 
@@ -153,6 +152,7 @@ void Executor::AcquireDevice() {
   }
 
   bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL");
+  bool enable_distribution = ipu_strategy_->enable_distribution;
   if (use_ipu_model) {
     std::map<std::string, std::string> deviceOpts{
         {
@@ -162,6 +162,16 @@ void Executor::AcquireDevice() {
     };
     device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice(
         deviceOpts);
+  } else if (enable_distribution) {
+    auto ipus_per_replica = ipu_strategy_->num_ipus /
+                            ipu_strategy_->popart_options.replicatedGraphCount;
+    auto device_id = popdist_get_device(ipus_per_replica);
+    device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById(
+        device_id);
+    PADDLE_ENFORCE_NOT_NULL(
+        device_, platform::errors::Unavailable(
+                     "Can't attach IPU in distribution, ipu_num = %d.",
+                     RequestIpus(ipu_strategy_->num_ipus)));
   } else {
     device_ =
         popart::DeviceManager::createDeviceManager().acquireAvailableDevice(
@@ -185,28 +195,29 @@ void Executor::SetWeightsIO() {
   auto opt_type = compiler_resources_->optimizer_type;
   VLOG(10) << "SetWeightsIO for " << opt_type;
   auto pre_post_fix = GetOptPrePostfix(opt_type);
-  for (const auto &weight_id : compiler_resources_->weights) {
+  for (const auto &weight_pd : compiler_resources_->weights) {
     for (const auto &pair : pre_post_fix) {
       // pair.first : popart prefix, pair.second : paddle postfix
-      auto popart_var_name = pair.first + weight_id;
-      auto paddle_var_name = weight_id + pair.second;
+      auto weight_pop = compiler_resources_->tensors[weight_pd];
+      auto popart_var = pair.first + weight_pop;
+      auto paddle_var = weight_pd + pair.second;
 
-      if (scope_->FindVar(paddle_var_name) == nullptr) {
+      if (scope_->FindVar(paddle_var) == nullptr) {
         continue;
       }
-
-      if (!session_->hasInfo(popart_var_name)) {
+      if (!session_->hasInfo(popart_var)) {
         continue;
       }
 
-      auto var = scope_->GetVar(paddle_var_name);
+      VLOG(10) << "Connect paddle weight: " << paddle_var
+               << " with popart weight: " << popart_var;
+      auto var = scope_->GetVar(paddle_var);
       auto data_ptr = var->GetMutable<framework::LoDTensor>()->data();
-
-      auto tensor_info = session_->getInfo(popart_var_name);
-      executor_resources_->weights_io.insert(popart_var_name,
+      auto tensor_info = session_->getInfo(popart_var);
+      executor_resources_->weights_io.insert(popart_var,
                                              {data_ptr, tensor_info});
       executor_resources_->weights_and_opt_state.emplace_back(
-          std::make_pair(popart_var_name, paddle_var_name));
+          std::make_pair(popart_var, paddle_var));
     }
   }
 }
@@ -284,6 +295,7 @@ void Executor::ConvertWeights(bool align_to_popart) {
 void Executor::WeightsFromPaddle() {
   ConvertWeights(true);
   session_->writeWeights(executor_resources_->weights_io);
+  session_->weightsFromHost();
 }
 
 // |-----------------------------------------------------|
@@ -297,13 +309,13 @@ void Executor::WeightsFromPaddle() {
 // Paddle -> halfToFloat: cast then save to paddle
 // Popart -> Paddle: copy from paddle to popart
 void Executor::WeightsToPaddle() {
+  session_->weightsToHost();
   session_->readWeights(executor_resources_->weights_io);
   ConvertWeights(false);
 }
 
 void Executor::SaveModelToHost(const std::string &path) {
   if (session_) {
-    session_->weightsToHost();
     WeightsToPaddle();
     session_->modelToHost(path);
   } else {
diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h
index b08b94b45ff65..c59e623ab20b0 100644
--- a/paddle/fluid/platform/device/ipu/ipu_executor.h
+++ b/paddle/fluid/platform/device/ipu/ipu_executor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <popart/patterns/patterns.hpp>
 #include <popart/session.hpp>
 #include <popart/tensorinfo.hpp>
+#include <popdist/popdist_poplar.hpp>
 
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
@@ -36,8 +37,7 @@ struct ExecutorResources {
   // map<tensor_id, paddle_var_ptr>
   popart::WeightsIO weights_io;
   // <popart_var, paddle_var> pairs, include weights and optimizer states
-  std::vector<std::pair<popart::TensorId, popart::TensorId>>
-      weights_and_opt_state;
+  std::vector<std::pair<popart::TensorId, std::string>> weights_and_opt_state;
 };
 
 class Executor {
@@ -53,14 +53,12 @@ class Executor {
            const std::vector<Tensor *> &outputs,
            const framework::ExecutionContext &ctx);
 
+  // sync weights from popart to paddle
+  void WeightsToHost();
+
   // detach IPU
   void Detach();
 
-  void SetWeightsIO();
-  void ConvertWeights(bool align_to_popart);
-  void WeightsFromPaddle();
-  void WeightsToPaddle();
-
   // Scope
   void SetScope(const Scope *scope) { scope_ = scope; }
 
@@ -79,6 +77,10 @@ class Executor {
 
  private:
   void AcquireDevice();
+  void SetWeightsIO();
+  void ConvertWeights(bool);
+  void WeightsFromPaddle();
+  void WeightsToPaddle();
 
  private:
   // not own
@@ -92,8 +94,6 @@ class Executor {
   std::unique_ptr<popart::Session> session_;
   // one OneSession means a graph
   std::unique_ptr<ExecutorResources> executor_resources_;
-
-  int step_ = 0;
 };
 
 }  // namespace ipu
diff --git a/paddle/fluid/platform/device/ipu/ipu_names.h b/paddle/fluid/platform/device/ipu/ipu_names.h
index a809a8c6e5bcc..b8a6ceffb5c15 100644
--- a/paddle/fluid/platform/device/ipu/ipu_names.h
+++ b/paddle/fluid/platform/device/ipu/ipu_names.h
@@ -24,6 +24,8 @@ static constexpr const char *sIpuIndexAttr = "ipu_index";
 static constexpr const char *sIpuStageAttr = "ipu_stage";
 static constexpr const char *sMatmulSerializeFactor = "serialize_factor";
 static constexpr const char *sMatmulSerializeMode = "serialize_mode";
+static constexpr const char *sAvailMemAttribute = "__available_memory";
+static constexpr const char *sOpNamescope = "op_namescope";
 static constexpr const char *sOpIdentifyIdAttr = "op_identify_id";
 static constexpr const char *sDebugInfoId = "__debug_info_id";
 
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
index e806b0b30e4e0..6172d4d7dc680 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc
@@ -62,23 +62,40 @@ IpuStrategy::IpuStrategy() {
                  [&]() { return name; })
 
   ADD_BOOL_OPTION(is_training);
-  ADD_BOOL_OPTION(save_init_onnx);
-  ADD_BOOL_OPTION(save_onnx_checkpoint);
   ADD_BOOL_OPTION(need_avg_shard);
   ADD_BOOL_OPTION(enable_fp16);
+  ADD_BOOL_OPTION(transfer_cast_op);
+  ADD_BOOL_OPTION(use_no_bias_optimizer);
+  ADD_BOOL_OPTION(enable_distribution);
   ADD_UINT64_OPTION(num_ipus);
   ADD_UINT64_OPTION(batches_per_step);
   ADD_UINT64_OPTION(micro_batch_size);
-  ADD_UINT64_OPTION(save_per_n_step);
+  ADD_UINT64_OPTION(random_seed);
   ADD_DOUBLE_OPTION(available_memory_proportion);
   ADD_DOUBLE_OPTION(loss_scaling);
   ADD_DOUBLE_OPTION(max_weight_norm);
+  ADD_STRING_OPTION(accl1_type);
+  ADD_STRING_OPTION(accl2_type);
+  ADD_STRING_OPTION(accl3_type);
+  ADD_STRING_OPTION(onnx_dump_path);
+  ADD_STRING_OPTION(weight_decay_mode);
 
 #undef ADD_STRING_OPTION
 #undef ADD_DOUBLE_OPTION
 #undef ADD_UINT64_OPTION
 #undef ADD_BOOL_OPTION
 
+#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name)                          \
+  RegisterSetter(bool_options, #name,                                        \
+                 [&](bool value) { runtime_options.aliased_name = value; }); \
+  RegisterGetter(options_getter, options_type, #name, "bool", [&]() {        \
+    return std::to_string(runtime_options.aliased_name);                     \
+  })
+
+  ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval);
+
+#undef ADD_RUNTIME_BOOL_OPTION
+
 #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType)        \
   RegisterSetter(uint64_options, #name, [&](std::uint64_t value) {        \
     PADDLE_ENFORCE_LT(                                                    \
@@ -171,6 +188,7 @@ IpuStrategy::IpuStrategy() {
   ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold,
                                  mergeVarUpdateMemThreshold);
   ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak);
+  ADD_POPART_UINT64_OPTION_ALIAS(replicated_graph_count, replicatedGraphCount);
   ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor);
   ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler);
   ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor,
@@ -462,12 +480,30 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor,
   } else if (opt == "use_io_tiles_to_store") {
     settings->location.storageTileSet =
         value > 0 ? popart::TileSet::IO : popart::TileSet::Compute;
+  } else if (opt == "sharding_domain_with_all") {
+    settings->location.shardingDomain =
+        popart::CommGroup(popart::CommGroupType::All, value);
+  } else if (opt == "sharding_domain_with_consecutive") {
+    settings->location.shardingDomain =
+        popart::CommGroup(popart::CommGroupType::Consecutive, value);
+  } else if (opt == "sharding_domain_with_orthogonal") {
+    settings->location.shardingDomain =
+        popart::CommGroup(popart::CommGroupType::Orthogonal, value);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "Unknown option ' %s' for tensor location: %s", opt, tensor));
   }
 }
 
+void IpuStrategy::SetAccumulateOuterFragmentSettings(
+    const std::uint64_t& schedule, const std::vector<int>& values) {
+  VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule;
+  auto schedule_ =
+      static_cast<popart::AccumulateOuterFragmentSchedule>(schedule);
+  popart_options.accumulateOuterFragmentSettings =
+      popart::AccumulateOuterFragmentSettings(schedule_, values);
+}
+
 void IpuStrategy::AddCustomOp(const std::string& paddle_op,
                               const std::string& popart_op,
                               const std::string& domain, int version) {
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h
index 571fb1e163718..786e2419cc0be 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.h
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h
@@ -24,6 +24,11 @@ namespace paddle {
 namespace platform {
 namespace ipu {
 
+struct RuntimeOptions {
+  // enable the eval mode in training by switching optimizers.
+  bool enable_eval = false;
+};
+
 class IpuStrategy {
  public:
   IpuStrategy();
@@ -32,19 +37,24 @@ class IpuStrategy {
   // training flag, true for training
   bool is_training = true;
 
-  // save the onnx model lowered by paddle program description
-  bool save_init_onnx = false;
-
-  // save the trained model
-  bool save_onnx_checkpoint = false;
-
   // average sharding, debugging used
   bool need_avg_shard = false;
 
   // flag for fp16, true for pure fp16
   bool enable_fp16 = false;
 
-  // Number ipus total needed, replica * ipu_per_replica
+  // enable transfer cast Op target from fp32 to fp16 in fp16 mode
+  bool transfer_cast_op = true;
+
+  // The mode of Adam/Lamb optimizer
+  // false: The standard Adam/Lamb optimizer
+  // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART
+  bool use_no_bias_optimizer = false;
+
+  // enable distributed computing for POD128 or POD256
+  bool enable_distribution = false;
+
+  // Number ipus total needed, local_replica * ipu_per_replica
   int num_ipus = 1;
 
   // batches per step
@@ -53,8 +63,8 @@ class IpuStrategy {
   // micro batch-size
   int micro_batch_size = 1;
 
-  // save paddle model per n steps
-  int save_per_n_step = 1;
+  // random seed
+  std::uint64_t random_seed = std::numeric_limits<std::uint64_t>::max();
 
   // TODO(alleng) remove this param
   // available memory proportion, 0.0f for disable
@@ -67,6 +77,29 @@ class IpuStrategy {
   // defaultMaxWeightNorm for adam optimizer
   float max_weight_norm = 65504.0f;
 
+  // file path for dumping compiled model in onnx format
+  std::string onnx_dump_path;
+
+  // Data type to use for tensor that stores first-order momentum optimizer
+  // state. FLOAT or FLOAT16
+  std::string accl1_type = "FLOAT";
+
+  // Data type to use for tensor that stores second-order momentum optimizer
+  // state. FLOAT or FLOAT16
+  std::string accl2_type = "FLOAT";
+
+  // Data type to use for tensor that stores third-order momentum optimizer
+  // state. FLOAT or FLOAT16
+  std::string accl3_type = "FLOAT";
+
+  // WeightDecayMode for setting the optimizer
+  // if set, it will override other settings
+  // value must be one of "decay" or "l2_regularization" or not set
+  std::string weight_decay_mode = "";
+
+  // Runtime Options
+  RuntimeOptions runtime_options;
+
   // popart session option
   popart::SessionOptions popart_options;
 
@@ -86,6 +119,8 @@ class IpuStrategy {
                               const std::string &value);
   void SetTensorLocation(const std::string &tensor, const std::string &option,
                          std::uint64_t value);
+  void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule,
+                                          const std::vector<int> &values);
   void AddCustomOp(const std::string &paddle_op, const std::string &popart_op,
                    const std::string &domain, int version);
 
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc
index c980bb780cfc0..7d92835534513 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc
@@ -34,15 +34,36 @@ Node *logical_not_handler(Graph *graph, Node *node) {
                       {GetOutputVarNode("Out", node)}, {});
 }
 
+Node *logical_or_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_logical_or",
+                      {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
+                      {GetOutputVarNode("Out", node)}, {});
+}
+
+Node *logical_and_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_logical_and",
+                      {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
+                      {GetOutputVarNode("Out", node)}, {});
+}
+
 Node *greater_than_handler(Graph *graph, Node *node) {
   return CreateBaseOp(graph, node, "popart_greater",
                       {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
                       {GetOutputVarNode("Out", node)}, {});
 }
 
+Node *less_than_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_less",
+                      {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
+                      {GetOutputVarNode("Out", node)}, {});
+}
+
 REGISTER_HANDLER(equal, equal_handler);
 REGISTER_HANDLER(logical_not, logical_not_handler);
+REGISTER_HANDLER(logical_or, logical_or_handler);
+REGISTER_HANDLER(logical_and, logical_and_handler);
 REGISTER_HANDLER(greater_than, greater_than_handler);
+REGISTER_HANDLER(less_than, less_than_handler);
 
 }  // namespace
 }  // namespace ipu
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
index d4a14a6d8409f..ba6675f40f400 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
@@ -98,6 +98,12 @@ Node *matmul_handler(Graph *graph, Node *node) {
   if (x_rank == 1) {
     perm = std::vector<int64_t>{0};
   } else if (x_rank == 2) {
+    if (!transpose_x && !transpose_y && is_float_equal(alpha, 1.0f)) {
+      return CreateBaseOp(
+          graph, node, "popart_matmul",
+          {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
+          node->outputs);
+    }
     return CreateGemm(graph, node,
                       {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
                       node->outputs, transpose_x, transpose_y, alpha);
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
index 3ec1999edc4f0..0339097d58790 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
@@ -32,30 +32,10 @@ const std::string GenerateOpName() {
 
 const std::string CreateOpIdentifyId(Node *node) {
   // format:
-  //   if has custom op_namescope:
-  //      {op_namescope}/op_type/_gen_*
-  //   else:
-  //     {op_type}/{out_var0}/{out_var1}/.../_gen_*
+  //   op_type/_gen_*
   // this name will be used as op name when exporting onnx model from popart
   auto op_type = node->Name();
-  std::string op_namescope;
-  if (node->Op()->HasAttr("op_namescope")) {
-    op_namescope =
-        BOOST_GET_CONST(std::string, node->Op()->GetAttr("op_namescope"));
-  } else {
-    op_namescope = "/";
-  }
-
-  if (op_namescope != "/") {
-    return {op_namescope + op_type + "/" + GenerateOpName()};
-  } else {
-    std::string op_out = "";
-    for (auto *out_node : node->outputs) {
-      op_out += "/";
-      op_out += out_node->Name();
-    }
-    return {op_type + op_out + "/" + GenerateOpName()};
-  }
+  return {op_type + "/" + GenerateOpName()};
 }
 
 Node *MakeVarNode(Graph *graph, Node *node) {
@@ -122,6 +102,12 @@ Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
   if (node->Op()->HasAttr(sMatmulSerializeMode)) {
     CopyOpAttr(sMatmulSerializeMode, node->Op(), new_node->Op());
   }
+  if (node->Op()->HasAttr(sAvailMemAttribute)) {
+    CopyOpAttr(sAvailMemAttribute, node->Op(), new_node->Op());
+  }
+  if (node->Op()->HasAttr(sOpNamescope)) {
+    CopyOpAttr(sOpNamescope, node->Op(), new_node->Op());
+  }
   {
     new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
     new_node->Op()->Flush();
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
index 0919afef4d83a..8bd0794368838 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
@@ -54,10 +54,36 @@ Node *checkpointoutput_handler(Graph *graph, Node *node) {
                       node->outputs);
 }
 
+Node *custom_nll_loss_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction"));
+  auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignoreIndex"));
+  auto inputIsLogProbability =
+      BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability"));
+  return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
+                      node->outputs,
+                      {{"reduction", reduction},
+                       {"ignoreIndex", ignoreIndex},
+                       {"inputIsLogProbability", inputIsLogProbability}});
+}
+
+Node *identity_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_identity", node->inputs,
+                      node->outputs);
+}
+
+Node *detach_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_detach_v2", node->inputs,
+                      node->outputs);
+}
+
 REGISTER_HANDLER(custom_op, custom_op_handler);
 REGISTER_HANDLER(print, print_handler);
 REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler);
 REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler);
+REGISTER_HANDLER(custom_nll_loss, custom_nll_loss_handler);
+REGISTER_HANDLER(identity, identity_handler);
+REGISTER_HANDLER(detach, detach_handler);
 
 }  // namespace
 }  // namespace ipu
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
index db429d2f62284..6ccb5441f8375 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc
@@ -49,6 +49,9 @@ Node *fill_constant_handler(Graph *graph, Node *node) {
     case framework::proto::VarType::INT64:
       value = std::vector<int64_t>(size, value_);
       break;
+    case framework::proto::VarType::BOOL:
+      value = std::vector<bool>(size, value_);
+      break;
     default:
       PADDLE_THROW(
           platform::errors::Unimplemented("fill_constant dtype: %d", dtype_));
@@ -417,6 +420,45 @@ Node *assign_handler(Graph *graph, Node *node) {
                       {GetOutputVarNode("Out", node)}, {});
 }
 
+Node *assign_value_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype"));
+  auto dtype = VarType2OnnxDtype(dtype_);
+  auto dims_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("shape"));
+  std::vector<int64_t> dims(dims_.begin(), dims_.end());
+  Attribute values;
+  std::string value_name;
+  switch (dtype_) {
+    case framework::proto::VarType::BOOL: {
+      value_name = "bool_values";
+      auto vec_int = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name));
+      std::vector<bool> vec_bool(vec_int.begin(), vec_int.end());
+      values = vec_bool;
+    } break;
+    case framework::proto::VarType::INT32:
+      value_name = "int32_values";
+      values = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name));
+      break;
+    case framework::proto::VarType::FP32:
+      value_name = "fp32_values";
+      values = BOOST_GET_CONST(std::vector<float>, op->GetAttr(value_name));
+      break;
+    case framework::proto::VarType::INT64:
+      value_name = "int64_values";
+      values = BOOST_GET_CONST(std::vector<int64_t>, op->GetAttr(value_name));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported data type(code %d) for AssignValue operator, only "
+          "supports bool, int32, float32 and int64.",
+          dtype));
+  }
+  return CreateConst(graph, node, node->inputs, node->outputs,
+                     AttributeMap{
+                         {"value", values}, {"dims", dims}, {"dtype", dtype},
+                     });
+}
+
 Node *fill_any_like_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto value = BOOST_GET_CONST(float, op->GetAttr("value"));
@@ -482,6 +524,41 @@ Node *one_hot_handler(Graph *graph, Node *node) {
   }
 }
 
+Node *one_hot_v2_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto depth = BOOST_GET_CONST(int, op->GetAttr("depth"));
+  auto allow_out_of_range =
+      BOOST_GET_CONST(bool, op->GetAttr("allow_out_of_range"));
+  if (allow_out_of_range) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Do not support allow_out_of_range=True"));
+  } else {
+    auto depth_tensor =
+        CreateConst(graph, node, {}, {}, {{"value", std::vector<int>{depth}},
+                                          {"dims", std::vector<int64_t>{1}},
+                                          {"dtype", ONNXDataType::INT32}});
+    Node *value_tensor = nullptr;
+    if (GetOutputVarNode("Out", node)->Var()->GetDataType() ==
+        framework::proto::VarType::FP16) {
+      value_tensor =
+          CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
+                                            {"dims", std::vector<int64_t>{2}},
+                                            {"dtype", ONNXDataType::FLOAT16}});
+    } else {
+      value_tensor =
+          CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0, 1}},
+                                            {"dims", std::vector<int64_t>{2}},
+                                            {"dtype", ONNXDataType::FLOAT}});
+    }
+
+    return CreateBaseOp(graph, node, "popart_onehot",
+                        {GetInputVarNode("X", node), depth_tensor->outputs[0],
+                         value_tensor->outputs[0]},
+                        {GetOutputVarNode("Out", node)},
+                        {{"axis", int64_t{-1}}});
+  }
+}
+
 Node *split_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
@@ -510,10 +587,12 @@ REGISTER_HANDLER(shape, shape_handler);
 REGISTER_HANDLER(slice, slice_handler);
 REGISTER_HANDLER(expand, expand_handler);
 REGISTER_HANDLER(assign, assign_handler);
+REGISTER_HANDLER(assign_value, assign_value_handler);
 REGISTER_HANDLER(fill_any_like, fill_any_like_handler);
 REGISTER_HANDLER(lookup_table_v2, lookup_table_v2_handler);
 REGISTER_HANDLER(split, split_handler);
 REGISTER_HANDLER(one_hot, one_hot_handler);
+REGISTER_HANDLER(one_hot_v2, one_hot_v2_handler);
 
 }  // namespace
 }  // namespace ipu
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 14f516235a720..57d6c5e119ccf 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -51,16 +51,20 @@ XPUOpMap& get_kl2_ops() {
       {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-      {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                    pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                               pOpKernelType(vartype::FP16, XPUPlace())})},
       {"conv2d_transpose_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"conv2d_transpose",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"depthwise_conv2d_grad",
-       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"depthwise_conv2d",
-       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
       {"dropout_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 18ac979b48ef3..5605d326f2cfa 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
               cuda_ctx,
               platform::errors::InvalidArgument(
                   "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
-          // Note: A trick method to init context, why GetAllocator interface
-          // needs a stream parameter?
           dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
-                                    .GetAllocator(p, cuda_ctx->stream())
+                                    .GetAllocator(p)
                                     .get());
           cuda_ctx->PartialInitWithAllocator();
           dev_ctx->SetGenerator(
@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
   phi::GPUContext::PartialInitWithoutAllocator();
   cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
-  workspace_.reset(new phi::DnnWorkspaceHandle(
-      memory::allocation::AllocatorFacade::Instance()
-          .GetAllocator(place, phi::GPUContext::stream())
-          .get()));
+  auto& instance = memory::allocation::AllocatorFacade::Instance();
+  instance.SetDefaultStream(place, phi::GPUContext::stream());
+  workspace_.reset(
+      new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
 }
 
 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
     // return workspace_.get();
     return phi::DnnWorkspaceHandle(
         memory::allocation::AllocatorFacade::Instance()
-            .GetAllocator(GetPlace(), phi::GPUContext::stream())
+            .GetAllocator(GetPlace())
             .get());
   }
   return phi::GPUContext::cudnn_workspace_handle();
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e104170ca2495..2c5f24d28c6d6 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -916,6 +916,11 @@ class DeviceContextPool {
 
   size_t size() const { return device_contexts_.size(); }
 
+  const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
+  device_contexts() const {
+    return device_contexts_;
+  }
+
  private:
   static DeviceContextPool* pool;
   std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index ce2dba4db02a0..4001fd744e677 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -559,6 +559,34 @@ inline void GetGroupConvWeightsTz(std::vector<int64_t>& weights_tz,  // NOLINT
   }
 }
 
+inline void RegisterModelLayout(
+    std::vector<std::unique_ptr<framework::OperatorBase>>& ops,
+    const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    auto check_attrib = [](std::unique_ptr<framework::OperatorBase>& op,
+                           const std::string& attrib_name) -> bool {
+      if (op->HasAttr(attrib_name)) {
+        auto data_format = op->Attr<std::string>(attrib_name);
+        platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout(
+            data_format.compare("NHWC") == 0 ? framework::DataLayout::kNHWC
+                                             : framework::DataLayout::kNCHW);
+        return true;
+      } else {
+        return false;
+      }
+    };
+
+    for (auto& op : ops) {
+      if (check_attrib(op, std::string("data_format"))) {
+        return;
+      }
+      if (check_attrib(op, std::string("data_layout"))) {
+        return;
+      }
+    }
+  }
+}
+
 inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) {
   return (op->GetAttrIfExists<std::string>("mkldnn_data_type") == "int8" ||
           op->GetAttrIfExists<bool>("use_quantizer"));
diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc
index ce2e49a1ccd39..d507153d3f5b4 100644
--- a/paddle/fluid/platform/profiler/cpu_utilization.cc
+++ b/paddle/fluid/platform/profiler/cpu_utilization.cc
@@ -118,8 +118,9 @@ float CpuUtilization::GetCpuUtilization() {
   float busy_time = (system_kernel_time_end - system_kernel_time_start) +
                     (system_user_time_end - system_user_time_start);
   float idle_time = system_idle_time_end - system_idle_time_start;
-  cpu_utilization = busy_time / (busy_time + idle_time);
-
+  if (busy_time + idle_time != 0) {
+    cpu_utilization = busy_time / (busy_time + idle_time);
+  }
 #elif defined(__linux__)
   float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) +
                     (system_tms_end_.tms_stime - system_tms_start_.tms_stime) +
@@ -127,7 +128,9 @@ float CpuUtilization::GetCpuUtilization() {
                     (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) +
                     (steal_end_ - steal_start_);
   float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_);
-  cpu_utilization = busy_time / (busy_time + idle_time);
+  if (busy_time + idle_time != 0) {
+    cpu_utilization = busy_time / (busy_time + idle_time);
+  }
 #else
   LOG(WARNING)
       << "Current System is not supported to get system cpu utilization"
@@ -148,13 +151,16 @@ float CpuUtilization::GetCpuCurProcessUtilization() {
   uint64_t end = FileTimeToUint64(end_);
   float busy_time = (process_kernel_time_end - process_kernel_time_start) +
                     (process_user_time_end - process_user_time_start);
-  cpu_process_utilization = busy_time / (end - start);
-  LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl;
+  if (end - start != 0) {
+    cpu_process_utilization = busy_time / (end - start);
+  }
 #elif defined(__linux__)
   float busy_time =
       (process_tms_end_.tms_utime - process_tms_start_.tms_utime) +
       (process_tms_end_.tms_stime - process_tms_start_.tms_stime);
-  cpu_process_utilization = busy_time / (end_ - start_);
+  if (end_ - start_ != 0) {
+    cpu_process_utilization = busy_time / (end_ - start_);
+  }
 #else
   LOG(WARNING)
       << "Current System is not supported to get process cpu utilization"
diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc
index 46cbb3358c6c4..ac46fbed10a20 100644
--- a/paddle/fluid/platform/profiler/profiler.cc
+++ b/paddle/fluid/platform/profiler/profiler.cc
@@ -44,6 +44,14 @@ std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) {
   return std::unique_ptr<Profiler>(new Profiler(options));
 }
 
+bool Profiler::IsCuptiSupported() {
+  bool supported = false;
+#ifdef PADDLE_WITH_CUPTI
+  supported = true;
+#endif
+  return supported;
+}
+
 Profiler::Profiler(const ProfilerOptions& options) {
   options_ = options;
   std::bitset<32> trace_switch(options_.trace_switch);
diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h
index f9a8ece050492..d24ee504bc640 100644
--- a/paddle/fluid/platform/profiler/profiler.h
+++ b/paddle/fluid/platform/profiler/profiler.h
@@ -43,6 +43,8 @@ class Profiler {
  public:
   static std::unique_ptr<Profiler> Create(const ProfilerOptions& options);
 
+  static bool IsCuptiSupported();
+
   void Prepare();
 
   void Start();
diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc
index b43389866c7a8..de314d298c90e 100644
--- a/paddle/fluid/platform/profiler/utils.cc
+++ b/paddle/fluid/platform/profiler/utils.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/dynload/cupti.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h
index cd56d34384268..b471d6b79833a 100644
--- a/paddle/fluid/platform/profiler/utils.h
+++ b/paddle/fluid/platform/profiler/utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <ctime>
 #include <string>
+#include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/os_info.h"
 
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index 1df917b8c3594..e89d8d96342e7 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -235,25 +235,13 @@ void BindDistributed(py::module *m) {
            py::call_guard<py::gil_scoped_release>());
 
 #if defined(PADDLE_WITH_GLOO)
-  py::class_<GlooOptions>(*m, "GlooOptions")
-      .def(py::init<>())
-      .def_readwrite("_device", &GlooOptions::device)
-      .def_static("create", &GlooOptions::create);
-
-  py::class_<GlooStore, std::shared_ptr<GlooStore>>(*m, "GlooStore")
-      .def(py::init(
-               [](const std::shared_ptr<paddle::distributed::TCPStore> &store) {
-                 return std::make_shared<GlooStore>(store);
-               }),
-           py::call_guard<py::gil_scoped_release>());
-
   py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
       *m, "ProcessGroupGloo", ProcessGroup)
-      .def(py::init<const std::shared_ptr<GlooStore> &, int, int,
-                    std::shared_ptr<GlooOptions> &>(),
+      .def(py::init<const std::shared_ptr<paddle::distributed::Store> &, int,
+                    int, std::shared_ptr<GlooOptions> &>(),
            py::call_guard<py::gil_scoped_release>())
-      .def(py::init([](const std::shared_ptr<GlooStore> &store, int rank,
-                       int world_size) {
+      .def(py::init([](const std::shared_ptr<paddle::distributed::Store> &store,
+                       int rank, int world_size) {
              auto opts = GlooOptions::create();
              char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
              if (ifname && strlen(ifname) > 1) {
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 1052f93d32ec3..75f0babdfe85b 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -132,8 +132,7 @@ void InitTensorWithTensor(TensorObject* self,
     self->tensor.set_impl(impl);
     VLOG(4) << "Same place, do ShareDataWith";
   } else {
-    self->tensor.set_impl(
-        src.copy_to(phi::TransToPhiBackend(place), true).impl());
+    self->tensor.set_impl(src.copy_to(place, true).impl());
     VLOG(4) << "Different place, do TensorCopy";
   }
   if (src.get_autograd_meta()) {
@@ -156,8 +155,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
   } else {
     auto temp =
         paddle::experimental::Tensor(std::make_shared<phi::DenseTensor>(src));
-    self->tensor.set_impl(
-        temp.copy_to(phi::TransToPhiBackend(place), true).impl());
+    self->tensor.set_impl(temp.copy_to(place, true).impl());
     VLOG(4) << "Different place, do TensorCopy";
   }
   egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index e110432c67d39..f3c48309e69fe 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args,
   EAGER_TRY
   auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
   auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
-  egr::RunBackward(tensors, grad_tensors,
-                   CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
+  egr::Backward(tensors, grad_tensors,
+                CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
   Py_INCREF(Py_None);
   return Py_None;
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args,
+                                            PyObject* kwargs) {
+  EAGER_TRY
+  auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+  auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
+  auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2);
+  auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
+  auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
+  auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5);
+  auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6);
+  auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7);
+
+  std::vector<paddle::experimental::Tensor> result =
+      egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph,
+                only_inputs, allow_unused, no_grad_vars);
+  VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
+  return ToPyObject(result, true /* return_py_none_if_not_initialize */);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
                                        PyObject* kwargs) {
   EAGER_TRY
@@ -139,7 +159,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
   auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2);
   bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
 
-  dst = src.copy_to(phi::TransToPhiBackend(place), blocking);
+  dst = src.copy_to(place, blocking);
   egr::EagerUtils::autograd_meta(&dst)->SetStopGradient(
       egr::EagerUtils::autograd_meta(&(src))->StopGradient());
   egr::EagerUtils::autograd_meta(&dst)->SetPersistable(
@@ -355,6 +375,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
   ins_auto_grad_metas.resize(ctx.InputRange().size());
   VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size();
   outs_auto_grad_metas.resize(ctx.OutputRange().size());
+
   for (size_t i = 0; i < ctx.InputRange().size(); i++) {
     ins_auto_grad_metas[i] =
         egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
@@ -384,11 +405,15 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
     // Prepare Grad outputs
     size_t no_grad_cnt = 0;
     for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) {
+      const std::vector<paddle::experimental::Tensor>& in_tensors =
+          ctx.InputsBetween(ctx.InputRangeAt(i).first,
+                            ctx.InputRangeAt(i).second);
+
       if (slot_map[0].find(i) != slot_map[0].end()) {
-        grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]);
+        grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]);
         grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]);
       } else {
-        grad_node->SetGradOutMeta(&ins_auto_grad_metas[i],
+        grad_node->SetGradOutMeta(in_tensors,
                                   ins_auto_grad_metas.size() - 1 - no_grad_cnt);
         grad_node->AddEdges(&ins_auto_grad_metas[i],
                             ins_auto_grad_metas.size() - 1 - no_grad_cnt);
@@ -397,11 +422,14 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args,
     }
     // Prepare Grad inputs with grad of fwd outputs
     for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
+      const std::vector<paddle::experimental::Tensor>& out_tensors =
+          ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first,
+                              ctx.OutputRangeAt(i).second);
+
       egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i);
       egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node);
-      grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i);
-      egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen(
-          ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
+      grad_node->SetGradInMeta(out_tensors, i);
+      egr::EagerUtils::CheckAndRetainGrad(out_tensors);
     }
 
     // Prepare Grad inputs with fwd outputs
@@ -452,6 +480,9 @@ PyMethodDef variable_functions[] = {
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    {"run_partial_grad",
+     (PyCFunction)(void (*)(void))eager_api_run_partial_grad,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy,
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 082ec382c79cd..52a43c4ebe8d8 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/framework/python_headers.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -32,10 +33,14 @@ limitations under the License. */
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/slice_utils.h"
+#include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
+#include "pybind11/detail/internals.h"
 
 namespace paddle {
 namespace pybind {
@@ -148,12 +153,22 @@ bool PyCheckTensor(PyObject* obj) {
 static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
                                      PyObject* kwargs) {
   EAGER_TRY
-  PADDLE_ENFORCE_EQ(
-      self->tensor.initialized(), true,
-      platform::errors::InvalidArgument(
-          "Tensor data of %s is Empty that indicates we have null tensor for "
-          "now, please check if it has no data and initialize it first.",
-          self->tensor.name()));
+  auto& api = pybind11::detail::npy_api::get();
+  if (!self->tensor.impl()) {
+    Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
+    Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
+    py_dims[0] = 0;
+    py_strides[0] = 0;
+
+    PyObject* array = api.PyArray_NewFromDescr_(
+        api.PyArray_Type_,
+        api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_FLOAT_), 1,
+        py_dims, py_strides, nullptr,
+        pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
+            pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
+        nullptr);
+    return array;
+  }
   auto tensor_dims = self->tensor.shape();
   auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type());
   auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type());
@@ -165,7 +180,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
     py_strides[i] = sizeof_dtype * numel;
     numel *= py_dims[i];
   }
-  auto& api = pybind11::detail::npy_api::get();
+
   PyObject* array = api.PyArray_NewFromDescr_(
       api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype),
       tensor_dims.size(), py_dims, py_strides, nullptr,
@@ -173,7 +188,11 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
           pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
       nullptr);
 
-  if (self->tensor.is_cpu()) {
+  if (!self->tensor.impl()->initialized()) {
+    return array;
+  }
+
+  if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
     platform::CPUPlace place;
@@ -182,7 +201,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
                                     pybind11::detail::array_proxy(array)->data),
                          place, dense_tensor->data(), sizeof_dtype * numel);
 #if defined(PADDLE_WITH_CUDA)
-  } else if (self->tensor.is_cuda()) {
+  } else if (self->tensor.is_gpu()) {
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
 
@@ -211,13 +230,38 @@ static PyObject* tensor_method__is_initialized(TensorObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* tensor_method__is_dense_tensor_hold_allocation(
+    TensorObject* self, PyObject* args, PyObject* kwargs) {
+  EAGER_TRY
+  auto dense_tensor =
+      std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
+  if (dense_tensor) {
+    return ToPyObject(dense_tensor->IsInitialized());
+  } else {
+    return ToPyObject(false);
+  }
+
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
                                         PyObject* kwargs) {
   EAGER_TRY
   auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0);
   bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
-  auto cp_tensor =
-      self->tensor.copy_to(phi::TransToPhiBackend(place), blocking);
+  auto cp_tensor = self->tensor.copy_to(place, blocking);
+  egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
+  egr::EagerUtils::autograd_meta(&cp_tensor)
+      ->SetPersistable(
+          egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable());
+  return ToPyObject(cp_tensor);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
+                                   PyObject* kwargs) {
+  EAGER_TRY
+  auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true);
   egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
   egr::EagerUtils::autograd_meta(&cp_tensor)
       ->SetPersistable(
@@ -264,7 +308,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args,
             egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable());
   }
 
-  self->tensor.copy_(src_tensor, blocking);
+  self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking);
 
   VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
           << self->tensor.name();
@@ -314,23 +358,25 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args,
     grad = meta->MutableGrad();
   }
 
-  if (grad->is_selected_rows()) {
-    auto selected_rows =
-        std::dynamic_pointer_cast<phi::SelectedRows>(grad->impl());
-    if (selected_rows->mutable_value()->IsInitialized()) {
-      selected_rows->mutable_rows()->clear();
-      selected_rows->mutable_value()->clear();
-    }
-  } else if (grad->is_dense_tensor()) {
-    if (grad->initialized()) {
-      if (set_to_zero) {
-        grad->set_impl(paddle::experimental::zeros_like(*grad).impl());
-      } else {
-        VLOG(4) << "Gradient of " << self->tensor.name()
-                << " is initialized, will be released.";
-        auto dense_tensor =
-            std::dynamic_pointer_cast<phi::DenseTensor>(grad->impl());
-        dense_tensor->MoveMemoryHolder();
+  if (grad->impl()) {
+    if (grad->is_selected_rows()) {
+      auto selected_rows =
+          std::dynamic_pointer_cast<phi::SelectedRows>(grad->impl());
+      if (selected_rows->mutable_value()->IsInitialized()) {
+        selected_rows->mutable_rows()->clear();
+        selected_rows->mutable_value()->clear();
+      }
+    } else if (grad->is_dense_tensor()) {
+      if (grad->initialized()) {
+        if (set_to_zero) {
+          grad->set_impl(paddle::experimental::zeros_like(*grad).impl());
+        } else {
+          VLOG(4) << "Gradient of " << self->tensor.name()
+                  << " is initialized, will be released.";
+          auto dense_tensor =
+              std::dynamic_pointer_cast<phi::DenseTensor>(grad->impl());
+          dense_tensor->MoveMemoryHolder();
+        }
       }
     }
   }
@@ -537,10 +583,13 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
     }
     if (op_type == "slice") {
       out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
-                                   paddle::experimental::Tensor(),
+                                   paddle::experimental::Tensor(), {}, {},
                                    std::move(attrs));
     } else if (op_type == "strided_slice") {
-      out = strided_slice_dygraph_function(self->tensor, attrs);
+      out = strided_slice_dygraph_function(
+          self->tensor, paddle::experimental::Tensor(),
+          paddle::experimental::Tensor(), paddle::experimental::Tensor(), {},
+          {}, {}, attrs);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Slice is only support slice and strided_slice, but we got %s which "
@@ -589,6 +638,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
     auto select_index = paddle::experimental::Tensor(
         egr::Controller::Instance().GenerateUniqueName());
     auto idx_tensor = std::make_shared<phi::DenseTensor>();
+    select_index.set_impl(idx_tensor);
     auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
         egr::Controller::Instance().GetExpectedPlace());
     paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
@@ -602,6 +652,216 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
+                                                     PyObject* args,
+                                                     PyObject* kwargs) {
+  EAGER_TRY
+  VLOG(4) << "Call __setitem_eager_tensor";
+
+  auto self_tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
+
+  PyObject* _index = PyTuple_GET_ITEM(args, 0);
+  PyObject* value_obj = PyTuple_GET_ITEM(args, 1);
+  // NOTE(zhiqiu): PyTuple_Pack increases refcount while PyTuple_New
+  // https://github.com/python/cpython/blob/24b63c695ae0a95b06379eaadace66735abac1e2/Objects/tupleobject.c#L251
+  PyObject* index_ptr =
+      !PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
+  DEFINE_PADDLE_SCOPE_GUARD([index_ptr, &_index]() {
+    if (!PyTuple_Check(_index)) {
+      Py_DECREF(index_ptr);
+      VLOG(4) << "Call Py_DECREF";
+    }
+  });
+
+  // TODO(pangyoki) add inplace(BumpInplaceVersion) if need
+
+  // 1. Check argumnets
+  bool parse_index = true;
+
+  // Check whether _index can be parsed.
+  const int size = PyTuple_GET_SIZE(index_ptr);
+  for (int dim = 0; dim < size; ++dim) {
+    PyObject* slice_item = PyTuple_GetItem(index_ptr, dim);
+    if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) ||
+          slice_item == Py_Ellipsis || slice_item == Py_None)) {
+      parse_index = false;
+      break;
+    }
+  }
+
+  // 2. Call op set_value to speed up if the condition is met,
+  // otherwise call TensorToPyArray.
+  // TODO(liym27): Try not to call TensorToPyArray because it always
+  // copys data to cpu place, which reduces performance.
+  if (parse_index) {
+    std::vector<int> axes, starts, ends, steps, decrease_axes, none_axes,
+        infer_flags, list_select_idxs;
+    // if index is a list, list_select_flag will be true
+    bool list_select_flag = false;
+    ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, &steps,
+                       &decrease_axes, &none_axes, &infer_flags,
+                       &list_select_idxs, &list_select_flag);
+
+    framework::AttributeMap attrs = {{"axes", axes},
+                                     {"starts", starts},
+                                     {"ends", ends},
+                                     {"steps", steps},
+                                     {"decrease_axes", decrease_axes},
+                                     {"none_axes", none_axes}};
+
+    if (egr::Controller::Instance().HasGrad()) {
+      PADDLE_ENFORCE_EQ(
+          egr::egr_utils_api::IsLeafTensor(self->tensor) &&
+              !egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(),
+          false, platform::errors::InvalidArgument(
+                     "Leaf Tensor (%s) that doesn't stop gradient can't use "
+                     "inplace strategy.",
+                     self->tensor.name()));
+    }
+
+    paddle::experimental::Tensor value_tensor;
+
+    if (PyCheckTensor(value_obj)) {
+      value_tensor = reinterpret_cast<TensorObject*>(value_obj)->tensor;
+
+      // pass the stop_gradient from value to tensor
+      if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() &&
+          egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) {
+        egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false);
+      }
+    } else if (py::isinstance<py::array>(value_obj)) {
+      paddle::experimental::Tensor value_tensor_tmp(
+          std::make_shared<phi::DenseTensor>(),
+          egr::Controller::Instance().GenerateUniqueName());
+      py::object value_obj_tmp(py::handle(value_obj), true);
+      py::object value = value_obj_tmp;
+      if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
+        if (!py::isinstance<py::array_t<float>>(value_obj_tmp)) {
+          value = pybind11::detail::CastNumpyArray<float>(value_obj_tmp);
+        }
+      } else if (self->tensor.dtype() ==
+                 paddle::experimental::DataType::FLOAT64) {
+        if (!py::isinstance<py::array_t<double>>(value_obj_tmp)) {
+          value = pybind11::detail::CastNumpyArray<double>(value_obj_tmp);
+        }
+      } else if (self->tensor.dtype() ==
+                 paddle::experimental::DataType::INT32) {
+        if (!py::isinstance<py::array_t<int32_t>>(value_obj_tmp)) {
+          value = pybind11::detail::CastNumpyArray<int32_t>(value_obj_tmp);
+        }
+      } else if (self->tensor.dtype() ==
+                 paddle::experimental::DataType::INT64) {
+        if (!py::isinstance<py::array_t<int64_t>>(value_obj_tmp)) {
+          value = pybind11::detail::CastNumpyArray<int64_t>(value_obj_tmp);
+        }
+      } else if (self->tensor.dtype() == paddle::experimental::DataType::BOOL) {
+        if (!py::isinstance<py::array_t<bool>>(value_obj_tmp)) {
+          value = pybind11::detail::CastNumpyArray<bool>(value_obj_tmp);
+        }
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "When assign a numpy.np value to a paddle.Tensor, "
+            "the data type of the paddle.Tensor must be bool, "
+            "float32, int32 or int64, "
+            "please check the type of tensor."));
+      }
+
+      if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+        SetTensorFromPyArray(
+            static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
+            value, platform::Place(platform::CUDAPlace(0)), false);
+#else
+        SetTensorFromPyArray(
+            static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
+            value, platform::Place(platform::CPUPlace()), false);
+#endif
+      } else {
+        SetTensorFromPyArray(
+            static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
+            value, value_tensor_tmp.inner_place(), false);
+      }
+
+      value_tensor = value_tensor_tmp;
+    } else {
+      py::object value_obj_tmp(py::handle(value_obj), true);
+      // convert the value to self data type
+      if (py::isinstance<py::float_>(value_obj_tmp) ||
+          py::isinstance<py::int_>(value_obj_tmp) ||
+          py::isinstance<py::bool_>(value_obj_tmp)) {
+        if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
+          attrs["fp32_values"] =
+              std::vector<float>{value_obj_tmp.cast<float>()};
+        } else if (self->tensor.dtype() ==
+                   paddle::experimental::DataType::FLOAT64) {
+          attrs["fp64_values"] =
+              std::vector<double>{value_obj_tmp.cast<double>()};
+        } else if (self->tensor.dtype() ==
+                   paddle::experimental::DataType::INT32) {
+          attrs["int32_values"] =
+              std::vector<int32_t>{value_obj_tmp.cast<int32_t>()};
+        } else if (self->tensor.dtype() ==
+                   paddle::experimental::DataType::INT64) {
+          attrs["int64_values"] =
+              std::vector<int64_t>{value_obj_tmp.cast<int64_t>()};
+        } else if (self->tensor.dtype() ==
+                   paddle::experimental::DataType::BOOL) {
+          attrs["bool_values"] = std::vector<int>{value_obj_tmp.cast<bool>()};
+        } else {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "When assign a value to a paddle.Tensor, "
+              "the data type of the paddle.Tensor must be bool, "
+              "float32, int32 or int64, "
+              "please check the type of tensor."));
+        }
+        attrs["shape"] = std::vector<int64_t>{1};
+
+      } else {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Value type error. The assign value allows "
+            "numpy.ndarray, integer, float or bool, "
+            "but received %s.",
+            Py_TYPE(value_obj)));
+      }
+    }
+
+    {
+      // Release gil and do tracing
+      py::gil_scoped_release release;
+      self->tensor = set_value_dygraph_function(self->tensor, value_tensor, {},
+                                                {}, {}, attrs);
+    }
+  } else {
+    auto self_numpy = TensorToPyArray(*self_tensor);
+    VLOG(4) << "parse_index is false";
+    if (PyCheckTensor(_index)) {
+      VLOG(4) << "index is tensor";
+      auto index_tensor = static_cast<phi::DenseTensor*>(
+          reinterpret_cast<TensorObject*>(_index)->tensor.impl().get());
+      auto index_numpy = TensorToPyArray(*index_tensor);
+      self_numpy[index_numpy] = py::object(py::handle(value_obj), true);
+    } else {
+      VLOG(4) << "index is not tensor";
+      self_numpy[_index] = py::object(py::handle(value_obj), true);
+    }
+    if (self->tensor.place() == paddle::PlaceType::kUNK) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      SetTensorFromPyArray(self_tensor, self_numpy,
+                           platform::Place(platform::CUDAPlace(0)), false);
+#else
+      SetTensorFromPyArray(self_tensor, self_numpy,
+                           platform::Place(platform::CPUPlace()), false);
+#endif
+    } else {
+      SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(),
+                           false);
+    }
+  }
+  Py_INCREF(Py_None);
+  return Py_None;
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
                                            PyObject* kwargs) {
   EAGER_TRY
@@ -703,12 +963,117 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* tensor_method_get_non_zero_indices(TensorObject* self,
+                                                    PyObject* args,
+                                                    PyObject* kwargs) {
+  EAGER_TRY
+  PADDLE_ENFORCE(self->tensor.is_sparse_coo_tensor(),
+                 paddle::platform::errors::Fatal(
+                     "this method is only effective for SparseCooTensor"));
+  auto sparse_coo_tensor =
+      std::dynamic_pointer_cast<phi::SparseCooTensor>(self->tensor.impl());
+  paddle::experimental::Tensor tensor(std::make_shared<phi::DenseTensor>(
+      sparse_coo_tensor->non_zero_indices()));
+  return ToPyObject(tensor);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_get_non_zero_elements(TensorObject* self,
+                                                     PyObject* args,
+                                                     PyObject* kwargs) {
+  EAGER_TRY
+  PADDLE_ENFORCE(
+      self->tensor.is_sparse_coo_tensor() ||
+          self->tensor.is_sparse_csr_tensor(),
+      paddle::platform::errors::Fatal("this method is only effective for "
+                                      "SparseCooTensor or SparseCsrTensor"));
+  if (self->tensor.is_sparse_coo_tensor()) {
+    auto sparse_coo_tensor =
+        std::dynamic_pointer_cast<phi::SparseCooTensor>(self->tensor.impl());
+    paddle::experimental::Tensor tensor(std::make_shared<phi::DenseTensor>(
+        sparse_coo_tensor->non_zero_elements()));
+    return ToPyObject(tensor);
+  } else {
+    auto sparse_csr_tensor =
+        std::dynamic_pointer_cast<phi::SparseCsrTensor>(self->tensor.impl());
+    paddle::experimental::Tensor tensor(std::make_shared<phi::DenseTensor>(
+        sparse_csr_tensor->non_zero_elements()));
+    return ToPyObject(tensor);
+  }
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_get_non_zero_crows(TensorObject* self,
+                                                  PyObject* args,
+                                                  PyObject* kwargs) {
+  EAGER_TRY
+  PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(),
+                 paddle::platform::errors::Fatal(
+                     "this method is only effective for SparseCsrTensor"));
+  auto sparse_csr_tensor =
+      std::dynamic_pointer_cast<phi::SparseCsrTensor>(self->tensor.impl());
+  paddle::experimental::Tensor tensor(
+      std::make_shared<phi::DenseTensor>(sparse_csr_tensor->non_zero_crows()));
+  return ToPyObject(tensor);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_get_non_zero_cols(TensorObject* self,
+                                                 PyObject* args,
+                                                 PyObject* kwargs) {
+  EAGER_TRY
+  PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(),
+                 paddle::platform::errors::Fatal(
+                     "this method is only effective for SparseCsrTensor"));
+  auto sparse_csr_tensor =
+      std::dynamic_pointer_cast<phi::SparseCsrTensor>(self->tensor.impl());
+  paddle::experimental::Tensor tensor(
+      std::make_shared<phi::DenseTensor>(sparse_csr_tensor->non_zero_cols()));
+  return ToPyObject(tensor);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args,
+                                         PyObject* kwargs) {
+  EAGER_TRY
+  return ToPyObject(self->tensor.is_sparse_coo_tensor() ||
+                    self->tensor.is_sparse_csr_tensor());
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_is_sparse_coo(TensorObject* self, PyObject* args,
+                                             PyObject* kwargs) {
+  EAGER_TRY
+  return ToPyObject(self->tensor.is_sparse_coo_tensor());
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args,
+                                             PyObject* kwargs) {
+  EAGER_TRY
+  return ToPyObject(self->tensor.is_sparse_csr_tensor());
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
+static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args,
+                                         PyObject* kwargs) {
+  EAGER_TRY
+  uint32_t inplace_version = self->tensor.current_inplace_version();
+
+  return ToPyObject(inplace_version);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyMethodDef variable_methods[] = {
     {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_is_initialized",
      (PyCFunction)(void (*)(void))tensor_method__is_initialized,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    {"_is_dense_tensor_hold_allocation",
+     (PyCFunction)(
+         void (*)(void))tensor_method__is_dense_tensor_hold_allocation,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to,
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_,
@@ -741,6 +1106,9 @@ PyMethodDef variable_methods[] = {
     {"_getitem_index_not_tensor",
      (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    {"__setitem_eager_tensor__",
+     (PyCFunction)(void (*)(void))tensor_method__setitem_eager_tensor,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {"_register_grad_hook",
      (PyCFunction)(void (*)(void))tensor_register_grad_hook,
      METH_VARARGS | METH_KEYWORDS, NULL},
@@ -751,6 +1119,28 @@ PyMethodDef variable_methods[] = {
      METH_VARARGS | METH_KEYWORDS, NULL},
     {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type,
      METH_VARARGS | METH_KEYWORDS, NULL},
+    /***the method of sparse tensor****/
+    {"non_zero_indices",
+     (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"non_zero_elements",
+     (PyCFunction)(void (*)(void))tensor_method_get_non_zero_elements,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"non_zero_crows",
+     (PyCFunction)(void (*)(void))tensor_method_get_non_zero_crows,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"non_zero_cols",
+     (PyCFunction)(void (*)(void))tensor_method_get_non_zero_cols,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"is_sparse", (PyCFunction)(void (*)(void))tensor_method_is_sparse,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"is_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_is_sparse_coo,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    /***the method of sparse tensor****/
+    {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version,
+     METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL, NULL, 0, NULL}};
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc
index 102cdbb91ab06..685e20aef2591 100644
--- a/paddle/fluid/pybind/eager_op_function_generator.cc
+++ b/paddle/fluid/pybind/eager_op_function_generator.cc
@@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) {
 
 std::string GenerateOpFunctionsBody(
     const paddle::framework::proto::OpProto* op_proto, std::string func_name,
-    bool use_inplace_strategy = false,
     std::map<std::string, std::string> inplace_map = {}) {
   auto& op_type = op_proto->type();
   std::string input_args = "";
-  std::string call_api_str = "auto out = " + op_type + "_dygraph_function(";
+  std::string call_api_str = "";
   std::string ins_initializer_with_null = "";
   std::string py_arg = "";
   int arg_idx = 0;
   int input_args_num = 0;
   std::string ins_cast_str = "";
   std::string view_strategy_str = "";
+  if (!inplace_map.empty()) {
+    // change call_api_str for inplace op
+    call_api_str = "auto out = " + op_type + "__dygraph_function(";
+  } else {
+    call_api_str = "auto out = " + op_type + "_dygraph_function(";
+  }
   for (auto& input : op_proto->inputs()) {
     auto& in_name = input.name();
     // skip those dispensable inputs, like ResidualData in conv2d
@@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody(
         HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name,
         viwe_input_name, viwe_output_name);
   }
-
-  return_str = "return ToPyObject(out);";
+  if (!inplace_map.empty()) {
+    // For inplace op, Use the input PyObject directly.
+    for (auto& inplace_pair : inplace_map) {
+      // Find index of inplace tensor, and directly use input PyObject.
+      std::string inplace_arg_name = inplace_pair.second;
+      std::string inplace_return_name = inplace_pair.first;
+      const char* RETURN_INPLACE_TENSOR_TEMPLATE =
+          "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, "
+          "\"%s\", \"%s\");\n"
+          "    ssize_t return_id = "
+          "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n"
+          "    return ToPyObject(out, return_id, args, arg_id);";
+      return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE,
+                                           op_type, inplace_arg_name, op_type,
+                                           inplace_return_name);
+      // only support one inplace_var in temporary.
+      PADDLE_ENFORCE_EQ(
+          inplace_map.size(), 1,
+          paddle::platform::errors::InvalidArgument(
+              "size of inplace_map must be 1, but got %d", inplace_map.size()));
+      break;
+    }
+  } else {
+    return_str = "return ToPyObject(out);";
+  }
 
   std::string function_args = "";
   if (input_args == "") {
@@ -383,7 +411,8 @@ GenerateOpFunctions() {
       continue;
     }
     std::string func_name = "eager_api_" + op_type;
-    std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name);
+    std::string op_function_str =
+        GenerateOpFunctionsBody(op_proto, func_name, {});
 
     // generate pybind item
     auto bind_function_str = paddle::string::Sprintf(
@@ -391,6 +420,40 @@ GenerateOpFunctions() {
 
     op_function_list.emplace_back(std::move(op_function_str));
     bind_function_list.emplace_back(std::move(bind_function_str));
+
+    // NOTE(pangyoki): Inplace Strategy.
+    // In this case, output will reuse input varbase.
+    // Dygraph mode needs to be aligned with the in-place strategy in static
+    // mode, and the mapping relationships between output and input that have
+    // been defined in static mode should be used in dygraph mode.
+    // Find which ops need to use Inplace strategy in static mode, and get the
+    // mapping relationship between Inplace output and input.
+    auto& infer_inplace =
+        paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
+    std::map<std::string, std::string> inplace_map;
+    // `sum` op has duplicate input. Don't consider adding inplace strategy
+    // for `sum` in temporary.
+    if (op_type != "sum" && infer_inplace) {
+      // Inplace OP: op_type_.
+      // The inplace OP needs a new implementation method.
+      auto in_to_outs = infer_inplace(true);
+      for (auto& inplace_pair : in_to_outs) {
+        inplace_map[inplace_pair.second] = inplace_pair.first;
+      }
+
+      std::string inplace_op_type = op_type + "_";
+      std::string inplace_func_name = "eager_api_" + inplace_op_type;
+      std::string inplace_op_function_str =
+          GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map);
+
+      // generate pybind item
+      auto inplace_bind_function_str =
+          paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type,
+                                  inplace_func_name, inplace_op_type);
+
+      op_function_list.emplace_back(std::move(inplace_op_function_str));
+      bind_function_list.emplace_back(std::move(inplace_bind_function_str));
+    }
   }
   if (append_custom_head_file) {
     op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE);
diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 2572866b8f519..a610c31ee8946 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -52,6 +52,12 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) {
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+PyObject* tensor_properties_is_leaf(TensorObject* self, void* closure) {
+  EAGER_TRY
+  return ToPyObject(egr::egr_utils_api::IsLeafTensor(self->tensor));
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 int tensor_properties_set_name(TensorObject* self, PyObject* value,
                                void* closure) {
   EAGER_TRY
@@ -96,7 +102,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value,
                      "Detected NULL grad"
                      "Please check if you have manually cleared"
                      "the grad inside autograd_meta"));
-  grad->copy_(src, true);
+  grad->copy_(src, self->tensor.inner_place(), true);
   return 0;
   EAGER_CATCH_AND_THROW_RETURN_ZERO
 }
@@ -179,6 +185,7 @@ struct PyGetSetDef variable_properties[] = {
      nullptr},
     {"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr},
     {"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr},
+    {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr},
     {nullptr, nullptr, nullptr, nullptr, nullptr}};
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 217edad0c0a10..2e884b212aff3 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+
 namespace paddle {
 namespace pybind {
 
@@ -62,6 +63,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) {
       return pybind11::detail::npy_api::NPY_INT32_;
     case phi::DataType::INT64:
       return pybind11::detail::npy_api::NPY_INT64_;
+    case phi::DataType::BFLOAT16:
+      return pybind11::detail::NPY_UINT16_;
     case phi::DataType::FLOAT16:
       return pybind11::detail::NPY_FLOAT16_;
     case phi::DataType::FLOAT32:
@@ -417,6 +420,8 @@ PyObject* ToPyObject(bool value) {
 
 PyObject* ToPyObject(int value) { return PyLong_FromLong(value); }
 
+PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); }
+
 PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); }
 
 PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); }
@@ -442,6 +447,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) {
   return obj;
 }
 
+PyObject* ToPyObject(const paddle::experimental::Tensor& value,
+                     ssize_t value_idx, PyObject* args, ssize_t arg_idx) {
+  // For inplace op, directly return the input PyObject of the inplace tensor.
+  // [Parameter]
+  // value: Useless parameter.
+  // value_idx: Useless parameter.
+  // args: Input PyObject.
+  // arg_idx: Index of inplace PyObject in input args. Used to find the input
+  // inplace PyObject.
+  PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);
+  Py_INCREF(obj);
+  return obj;
+}
+
 PyObject* ToPyObject(const std::vector<bool>& value) {
   PyObject* result = PyList_New((Py_ssize_t)value.size());
 
@@ -492,20 +511,26 @@ PyObject* ToPyObject(const std::vector<double>& value) {
   return result;
 }
 
-PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) {
+PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
+                     bool return_py_none_if_not_initialize) {
   PyObject* result = PyList_New((Py_ssize_t)value.size());
 
   for (size_t i = 0; i < value.size(); i++) {
-    PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
-    if (obj) {
-      auto v = reinterpret_cast<TensorObject*>(obj);
-      new (&(v->tensor)) paddle::experimental::Tensor();
-      v->tensor = value[i];
+    if (!value[i].initialized() && return_py_none_if_not_initialize) {
+      Py_INCREF(Py_None);
+      PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), Py_None);
     } else {
-      PADDLE_THROW(platform::errors::Fatal(
-          "tp_alloc return null, can not new a PyObject."));
+      PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
+      if (obj) {
+        auto v = reinterpret_cast<TensorObject*>(obj);
+        new (&(v->tensor)) paddle::experimental::Tensor();
+        v->tensor = value[i];
+      } else {
+        PADDLE_THROW(platform::errors::Fatal(
+            "tp_alloc return null, can not new a PyObject."));
+      }
+      PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
     }
-    PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
   }
 
   return result;
@@ -825,7 +850,7 @@ paddle::experimental::ScalarArray CastPyArg2ScalarArray(
   // obj could be: int, float, bool, paddle.Tensor
   PyTypeObject* type = obj->ob_type;
   auto type_name = std::string(type->tp_name);
-  if (type_name == "list") {
+  if (type_name == "list" || type_name == "tuple") {
     std::vector<int> value = CastPyArg2Ints(obj, op_type, arg_pos);
     return paddle::experimental::ScalarArray(value);
 
@@ -904,28 +929,10 @@ std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
   return result;
 }
 
-paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
-                                                const std::string& op_type,
-                                                ssize_t arg_pos) {
-  if (obj == Py_None) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "%s(): argument (position %d) must be "
-        "int or place, but got %s",
-        op_type, arg_pos + 1,
-        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
-  }
-
-  PyTypeObject* type = obj->ob_type;
-  auto type_name = std::string(type->tp_name);
-  if (type_name == "int") {
-    int value = CastPyArg2Int(obj, op_type, arg_pos);
-    return static_cast<paddle::experimental::Backend>(value);
-  } else {
-    platform::Place place = CastPyArg2Place(obj, arg_pos);
-    return phi::TransToPhiBackend(place);
-  }
-
-  return paddle::experimental::Backend::CPU;
+paddle::experimental::Place CastPyArg2Place(PyObject* obj,
+                                            const std::string& op_type,
+                                            ssize_t arg_pos) {
+  return CastPyArg2Place(obj, arg_pos);
 }
 
 paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 2187555e1c3c7..3500082ba645f 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
                                                     ssize_t arg_pos);
 
 PyObject* ToPyObject(int value);
+PyObject* ToPyObject(uint32_t value);
 PyObject* ToPyObject(bool value);
 PyObject* ToPyObject(int64_t value);
 PyObject* ToPyObject(float value);
@@ -63,12 +64,15 @@ PyObject* ToPyObject(double value);
 PyObject* ToPyObject(const char* value);
 PyObject* ToPyObject(const std::string& value);
 PyObject* ToPyObject(const paddle::experimental::Tensor& value);
+PyObject* ToPyObject(const paddle::experimental::Tensor& value,
+                     ssize_t value_idx, PyObject* args, ssize_t arg_idx);
 PyObject* ToPyObject(const std::vector<bool>& value);
 PyObject* ToPyObject(const std::vector<int>& value);
 PyObject* ToPyObject(const std::vector<int64_t>& value);
 PyObject* ToPyObject(const std::vector<float>& value);
 PyObject* ToPyObject(const std::vector<double>& value);
-PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value);
+PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
+                     bool return_py_none_if_not_initialize = false);
 PyObject* ToPyObject(const platform::Place& value);
 PyObject* ToPyObject(const framework::LoDTensor* value);
 PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
@@ -83,6 +87,17 @@ struct TupleTensorResult {
     TupleTensorResult<Tuple, N - 1>::Run(out, result);
     PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
   }
+
+  static void Run(const Tuple& out, PyObject* result, ssize_t value_idx,
+                  PyObject* args, ssize_t arg_idx) {
+    TupleTensorResult<Tuple, N - 1>::Run(out, result, value_idx, args, arg_idx);
+    if (N - 1 == value_idx) {
+      PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out),
+                                                 value_idx, args, arg_idx));
+    } else {
+      PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
+    }
+  }
 };
 
 template <typename Tuple>
@@ -90,6 +105,16 @@ struct TupleTensorResult<Tuple, 1> {
   static void Run(const Tuple& out, PyObject* result) {
     PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out)));
   }
+
+  static void Run(const Tuple& out, PyObject* result, ssize_t value_idx,
+                  PyObject* args, ssize_t arg_idx) {
+    if (value_idx == 0) {
+      PyTuple_SET_ITEM(result, 0,
+                       ToPyObject(std::get<0>(out), value_idx, args, arg_idx));
+    } else {
+      PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out)));
+    }
+  }
 };
 
 template <typename... Args>
@@ -102,6 +127,26 @@ PyObject* ToPyObject(const std::tuple<Args...>& out) {
   return result;
 }
 
+template <typename... Args>
+PyObject* ToPyObject(const std::tuple<Args...>& out, ssize_t value_idx,
+                     PyObject* args, ssize_t arg_idx) {
+  // For inplace op, directly return the input PyObject of the inplace tensor.
+  // [Parameter]
+  // out: Outputs tuple after executing op.
+  // value_idx: Index of inplace tensor in outputs tuple. Used to find the
+  // output inplace tensor.
+  // args: Input PyObject.
+  // arg_idx: Index of inplace PyObject in input args. Used to find the input
+  // inplace PyObject.
+  auto len = sizeof...(Args);
+  PyObject* result = PyTuple_New(len);
+
+  TupleTensorResult<decltype(out), sizeof...(Args)>::Run(out, result, value_idx,
+                                                         args, arg_idx);
+
+  return result;
+}
+
 paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
                                               const std::string& op_type,
                                               ssize_t arg_pos);
@@ -109,9 +154,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
 paddle::experimental::ScalarArray CastPyArg2ScalarArray(
     PyObject* obj, const std::string& op_type, ssize_t arg_pos);
 
-paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
-                                                const std::string& op_type,
-                                                ssize_t arg_pos);
+paddle::experimental::Place CastPyArg2Place(PyObject* obj,
+                                            const std::string& op_type,
+                                            ssize_t arg_pos);
 
 paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
                                                   const std::string& op_type,
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index 3145a9cf7655c..01dae420cc6ab 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) {
       .def("stop_server", &GraphPyClient::stop_server)
       .def("get_node_feat",
            [](GraphPyClient& self, std::string node_type,
-              std::vector<uint64_t> node_ids,
+              std::vector<int64_t> node_ids,
               std::vector<std::string> feature_names) {
              auto feats =
                  self.get_node_feat(node_type, node_ids, feature_names);
@@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) {
            })
       .def("set_node_feat",
            [](GraphPyClient& self, std::string node_type,
-              std::vector<uint64_t> node_ids,
+              std::vector<int64_t> node_ids,
               std::vector<std::string> feature_names,
               std::vector<std::vector<py::bytes>> bytes_feats) {
              std::vector<std::vector<std::string>> feats(bytes_feats.size());
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 9b373a58181f1..7a00f91da2e36 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -52,11 +52,13 @@ limitations under the License. */
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/memory/allocation/mmap_allocator.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/op_function.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 #include "paddle/fluid/pybind/slice_utils.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/phi/core/compat/arg_map_context.h"
+#include "paddle/phi/core/compat/type_defs.h"
 
 namespace paddle {
 namespace pybind {
@@ -117,7 +119,11 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook {
       return var;
     }
 
-    return PyObjectCast<std::shared_ptr<imperative::VarBase>>(res)->SharedVar();
+    auto res_varbase = PyObjectCast<std::shared_ptr<imperative::VarBase>>(res);
+    // Here the reference count of `res` is 2, so we decreases the reference
+    // count manually to avoid memory leaks
+    Py_DECREF(res);
+    return res_varbase->SharedVar();
   }
 
  private:
@@ -380,46 +386,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
   return result;
 }
 
-// cast numpy type form S to T, this may allocate new memory
-template <class T, class S>
-static py::array_t<T> CastNumpyType(py::array_t<S> array) {
-  if (std::is_same<T, S>::value) {
-    return array;
-  }
-  auto dim = array.ndim();
-  std::vector<py::ssize_t> result_shape(dim);
-  for (auto i = 0; i < dim; i++) {
-    result_shape[i] = array.shape(i);
-  }
-
-  py::array_t<T> result(result_shape);
-
-  return py::vectorize([](S s) { return static_cast<T>(s); })(array);
-}
-
-template <class T>
-static py::array_t<T> CastNumpyArray(const py::object &array) {
-  if (py::isinstance<py::array_t<float>>(array)) {
-    return CastNumpyType<T>(array.cast<py::array_t<float>>());
-  } else if (py::isinstance<py::array_t<double>>(array)) {
-    return CastNumpyType<T>(array.cast<py::array_t<double>>());
-  } else if (py::isinstance<py::array_t<int32_t>>(array)) {
-    return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
-  } else if (py::isinstance<py::array_t<int64_t>>(array)) {
-    return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
-  } else if (py::isinstance<py::array_t<bool>>(array)) {
-    return CastNumpyType<T>(array.cast<py::array_t<bool>>());
-  } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "Value type error. The assign numpy value allows integer, float, "
-        "double and bool, "
-        "but received %s.",
-        Py_TYPE(array.ptr())->tp_name));
-  }
-  // can't reach here
-  return py::array_t<T>();
-}
-
 static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
     const PyNameVarBaseMap &map) {
   imperative::NameVarBaseMap result;
@@ -436,6 +402,28 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
   return result;
 }
 
+paddle::imperative::NameTensorMap ConvertToNameTensorMap(
+    const PyNameVarBaseMap &map) {
+  paddle::imperative::NameTensorMap result;
+  for (auto &pair : map) {
+    auto var_vec = CastPyArg2VectorOfTensor(pair.second.ptr(), 0);
+    if (!var_vec.empty()) {
+      // change vector<Tensor> -> vector<shared_ptr<Tensor>>
+      std::vector<std::shared_ptr<egr::EagerVariable>> dst_var_vec;
+      for (auto &v : var_vec) {
+        dst_var_vec.emplace_back(
+            std::make_shared<egr::EagerVariable>(std::move(v)));
+      }
+      result.emplace(pair.first, std::move(dst_var_vec));
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      PyErr_Occurred(), nullptr,
+      platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred()))));
+  return result;
+}
+
 template <typename P>
 static void VarBaseCopy(std::shared_ptr<imperative::VarBase> &src,  // NOLINT
                         imperative::VarBase &dst,                   // NOLINT
@@ -826,27 +814,29 @@ void BindImperative(py::module *m_ptr) {
                 py::object value = value_obj;
                 if (self->DataType() == framework::proto::VarType::FP32) {
                   if (!py::isinstance<py::array_t<float>>(value_obj)) {
-                    value = CastNumpyArray<float>(value_obj);
+                    value = pybind11::detail::CastNumpyArray<float>(value_obj);
                   }
                 } else if (self->DataType() ==
                            framework::proto::VarType::FP64) {
                   if (!py::isinstance<py::array_t<double>>(value_obj)) {
-                    value = CastNumpyArray<double>(value_obj);
+                    value = pybind11::detail::CastNumpyArray<double>(value_obj);
                   }
                 } else if (self->DataType() ==
                            framework::proto::VarType::INT32) {
                   if (!py::isinstance<py::array_t<int32_t>>(value_obj)) {
-                    value = CastNumpyArray<int32_t>(value_obj);
+                    value =
+                        pybind11::detail::CastNumpyArray<int32_t>(value_obj);
                   }
                 } else if (self->DataType() ==
                            framework::proto::VarType::INT64) {
                   if (!py::isinstance<py::array_t<int64_t>>(value_obj)) {
-                    value = CastNumpyArray<int64_t>(value_obj);
+                    value =
+                        pybind11::detail::CastNumpyArray<int64_t>(value_obj);
                   }
                 } else if (self->DataType() ==
                            framework::proto::VarType::BOOL) {
                   if (!py::isinstance<py::array_t<bool>>(value_obj)) {
-                    value = CastNumpyArray<bool>(value_obj);
+                    value = pybind11::detail::CastNumpyArray<bool>(value_obj);
                   }
                 } else {
                   PADDLE_THROW(platform::errors::InvalidArgument(
@@ -2079,8 +2069,8 @@ void BindImperative(py::module *m_ptr) {
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
               framework::AttributeMap attrs) {
              // TODO(xiongkun): move this function outside of tracer.
-             auto ins_map = ConvertToNameVarBaseMap(ins);
-             auto outs_map = ConvertToNameVarBaseMap(outs);
+             auto ins_map = ConvertToNameTensorMap(ins);
+             auto outs_map = ConvertToNameTensorMap(outs);
              {
                auto to_vector = [](paddle::SmallVector<std::string> &vec) {
                  return std::vector<std::string>(vec.begin(), vec.end());
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index b008308e27d9a..c8f0acd0b8a85 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) {
       .def("params_file", &AnalysisConfig::params_file)
       .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
            py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
+      .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16,
+           py::arg("gpu_fp16_disabled_op_types") =
+               std::unordered_set<std::string>({}))
       .def("enable_xpu", &AnalysisConfig::EnableXpu,
            py::arg("l3_workspace_size") = 16 * 1024 * 1024,
            py::arg("locked") = false, py::arg("autotune") = true,
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index bb45c1c40603f..ecbacd37d5666 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
       .def("var", &Node::Var, return_value_policy::reference)
       .def("op", &Node::Op, return_value_policy::reference)
       .def("id", &Node::id)
+      .def("graph_id", &Node::GraphId)
       .def("original_desc_id", &Node::OriginalDescId)
       .def("is_op", &Node::IsOp)
       .def("is_var", &Node::IsVar)
diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc
index 8d78adaf5a473..1520174fba288 100644
--- a/paddle/fluid/pybind/kernel_signature_generator.cc
+++ b/paddle/fluid/pybind/kernel_signature_generator.cc
@@ -46,10 +46,19 @@ int main(int argc, char **argv) {
   auto &kernel_factory = phi::KernelFactory::Instance();
   std::string kernel_signature_map_str{"{"};
   for (const auto &op_kernel_pair : kernel_factory.kernels()) {
-    if (kernel_signature_map.Has(op_kernel_pair.first)) {
+    std::string op_name = op_kernel_pair.first;
+    const paddle::flat_hash_map<std::string, std::string> &kernel_name_map =
+        phi::OpUtilsMap::Instance().base_kernel_name_map();
+    for (auto &it : kernel_name_map) {
+      if (it.second == op_name) {
+        op_name = it.first;
+        break;
+      }
+    }
+    if (kernel_signature_map.Has(op_name)) {
       kernel_signature_map_str =
           kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{";
-      auto &args = kernel_signature_map.Get(op_kernel_pair.first).args;
+      auto &args = kernel_signature_map.Get(op_name).args;
 
       kernel_signature_map_str += "\"inputs\":[";
       auto inputs_ = std::get<0>(args);
diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc
index 09c3cea398b2a..1d483abd7746c 100644
--- a/paddle/fluid/pybind/op_function_common.cc
+++ b/paddle/fluid/pybind/op_function_common.cc
@@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() {
   }
 }
 
+ssize_t GetIdxFromCoreOpsInfoMap(
+    const std::unordered_map<std::string, std::vector<std::string>>&
+        core_ops_info_map,
+    const std::string& op_type, const std::string& name) {
+  // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`.
+  // `core_ops_args_info`: get index from core_ops_args_info[op_type] according
+  // to input name.
+  // `core_ops_returns_info`: get index from core_ops_returns_info[op_type]
+  // according to return name.
+  if (!core_ops_info_map.count(op_type)) {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Op %s is not found in core_ops_*_info map.", op_type));
+  } else {
+    auto args_list = core_ops_info_map.at(op_type);
+    auto it = std::find(args_list.begin(), args_list.end(), name);
+    if (it == args_list.end()) {
+      PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.",
+                                           name, op_type));
+    } else {
+      return std::distance(args_list.begin(), it);
+    }
+  }
+  return -1;
+}
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h
index 7ead985266725..33d0e242a027d 100644
--- a/paddle/fluid/pybind/op_function_common.h
+++ b/paddle/fluid/pybind/op_function_common.h
@@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs(  // NOLINT
 
 void InitOpsAttrTypeMap();
 
+ssize_t GetIdxFromCoreOpsInfoMap(
+    const std::unordered_map<std::string, std::vector<std::string>>&
+        core_ops_info_map,
+    const std::string& op_type, const std::string& name);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h
index 9e86e3df8a688..65b5beb865d1c 100644
--- a/paddle/fluid/pybind/op_function_generator.h
+++ b/paddle/fluid/pybind/op_function_generator.h
@@ -38,7 +38,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"assign", {"X"}},
     {"reshape2", {"X", "Shape"}},
     {"expand", {"X", "ExpandTimes"}},
-    {"slice", {"Input", "StartsTensor", "EndsTensor"}},
+    {"slice",
+     {"Input", "StartsTensor", "EndsTensor", "StartsTensorList",
+      "EndsTensorList"}},
+    {"strided_slice",
+     {"Input", "StartsTensor", "EndsTensor", "StridesTensor",
+      "StartsTensorList", "EndsTensorList", "StridesTensorList"}},
+    {"set_value",
+     {"Input", "ValueTensor", "StartsTensorList", "EndsTensorList",
+      "StepsTensorList"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"X", "InScale", "InAccum", "InState"}},
     {"nll_loss", {"X", "Label", "Weight"}},
@@ -88,6 +96,8 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"nce",
      {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs",
       "CustomDistAlias", "CustomDistAliasProbs"}},
+    {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}},
+    {"group_norm", {"X", "Scale", "Bias"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 21bbc7f3e369b..f5c853fb4b8ee 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -114,6 +114,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/metrics_py.h"
 #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
+#include "paddle/phi/backends/device_manager.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/pybind/nccl_wrapper_py.h"
@@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) {
   // stored in this static instance to avoid illegal memory access.
   m.def("clear_kernel_factory",
         []() { phi::KernelFactory::Instance().kernels().clear(); });
+  m.def("clear_device_manager", []() {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    phi::DeviceManager::Clear();
+#endif
+  });
 
   // NOTE(zjl): ctest would load environment variables at the beginning even
   // though we have not `import paddle.fluid as fluid`. So we add this API
@@ -1755,6 +1761,7 @@ All parameter, weight, gradient are variables in Paddle.
                out (core.Variable|None): the found variable or None.
            )DOC",
            py::return_value_policy::reference)
+      .def("size", &Scope::Size)
       .def("erase", &Scope::EraseVars, py::arg("names"),
            R"DOC(
            Find variable named :code:`name` in the current scope or
@@ -2851,6 +2858,9 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run",
            [](StandaloneExecutor &self, std::vector<std::string> feed_names,
               std::vector<std::string> fetch_names) {
+             platform::RecordEvent record_event(
+                 "StandaloneExecutor:run",
+                 platform::TracerEventType::UserDefined, 1);
              paddle::framework::FetchList ret;
              {
                pybind11::gil_scoped_release release;
@@ -3312,6 +3322,7 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<paddle::platform::Profiler>(m, "_Profiler")
       .def("create", &paddle::platform::Profiler::Create,
            py::return_value_policy::take_ownership)
+      .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
       .def("prepare",
            [](paddle::platform::Profiler *profiler) {
              platform::EnableHostEventRecorder();
@@ -4258,6 +4269,7 @@ All parameter, weight, gradient are variables in Paddle.
                  platform::ipu::IpuBackend::GetInstance());
            },
            py::return_value_policy::reference)
+      .def("weights_to_host", &platform::ipu::IpuBackend::WeightsToHost)
       .def("detach", &platform::ipu::IpuBackend::Detach)
       .def("reset", &platform::ipu::IpuBackend::Reset)
       .def("set_scope", &platform::ipu::IpuBackend::SetScope)
@@ -4305,6 +4317,15 @@ All parameter, weight, gradient are variables in Paddle.
                          option_name, option.first.cast<std::string>(),
                          option.second.cast<std::uint64_t>());
                    }
+                 } else if (option_name == "accumulate_outer_fragment") {
+                   for (auto option : element.second.cast<py::dict>()) {
+                     std::vector<int> values;
+                     for (auto value : option.second.cast<py::list>()) {
+                       values.push_back(value.cast<int>());
+                     }
+                     self.SetAccumulateOuterFragmentSettings(
+                         option.first.cast<std::uint64_t>(), values);
+                   }
                  } else if (option_name == "custom_op") {
                    std::string paddle_op;
                    std::string popart_op;
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 6849fcb039410..bf459bd468421 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -52,6 +52,46 @@ constexpr int NPY_UINT16_ = 4;
 constexpr int NPY_COMPLEX64 = 14;
 constexpr int NPY_COMPLEX128 = 15;
 
+// cast numpy type form S to T, this may allocate new memory
+template <class T, class S>
+static py::array_t<T> CastNumpyType(py::array_t<S> array) {
+  if (std::is_same<T, S>::value) {
+    return array;
+  }
+  auto dim = array.ndim();
+  std::vector<py::ssize_t> result_shape(dim);
+  for (auto i = 0; i < dim; i++) {
+    result_shape[i] = array.shape(i);
+  }
+
+  py::array_t<T> result(result_shape);
+
+  return py::vectorize([](S s) { return static_cast<T>(s); })(array);
+}
+
+template <class T>
+static py::array_t<T> CastNumpyArray(const py::object &array) {
+  if (py::isinstance<py::array_t<float>>(array)) {
+    return CastNumpyType<T>(array.cast<py::array_t<float>>());
+  } else if (py::isinstance<py::array_t<double>>(array)) {
+    return CastNumpyType<T>(array.cast<py::array_t<double>>());
+  } else if (py::isinstance<py::array_t<int32_t>>(array)) {
+    return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
+  } else if (py::isinstance<py::array_t<int64_t>>(array)) {
+    return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
+  } else if (py::isinstance<py::array_t<bool>>(array)) {
+    return CastNumpyType<T>(array.cast<py::array_t<bool>>());
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "Value type error. The assign numpy value allows integer, float, "
+        "double and bool, "
+        "but received %s.",
+        Py_TYPE(array.ptr())->tp_name));
+  }
+  // can't reach here
+  return py::array_t<T>();
+}
+
 // Note: Since float16 is not a builtin type in C++, we register
 // paddle::platform::float16 as numpy.float16.
 // Ref: https://github.com/pybind/pybind11/issues/1776
diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt
index 4e273f6d551ed..e777a8e3ab4e6 100644
--- a/paddle/infrt/CMakeLists.txt
+++ b/paddle/infrt/CMakeLists.txt
@@ -3,12 +3,22 @@ if (NOT WITH_INFRT)
 endif()
 
 option(INFRT_WITH_PHI  "Compile INFRT with PHI"    ON)
+option(INFRT_WITH_GPU  "Compile INFRT with GPU"    OFF)
+option(INFRT_WITH_TRT  "Compile INFRT with TensorRT"    OFF)
 
 #TODO(xiaowei) remove fluid
 include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
 
 if (INFRT_WITH_PHI)
-    add_definitions("-DINFRT_WITH_PHI")
+  add_definitions("-DINFRT_WITH_PHI")
+
+  # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later.
+  if (INFRT_WITH_GPU)
+    add_definitions("-DINFRT_WITH_GPU")
+    if (INFRT_WITH_TRT)
+      add_definitions("-DINFRT_WITH_TRT")
+    endif()
+  endif()
 endif()
 
 # compile flags
@@ -92,7 +102,6 @@ set(infrt_mlir_incs
         test_kernels_inc
         tensor_shape_inc
         dense_tensor_inc
-        pd_ops_inc
         pd_extra_ops_inc
         trt_ops_inc
         )
@@ -106,6 +115,9 @@ if (INFRT_WITH_PHI)
 endif()
 
 cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive)
+if (INFRT_WITH_TRT)
+  target_link_libraries(infrt infrt_trt)
+endif()
 cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto)
 add_dependencies(infrt ${infrt_mlir_incs} mlir-headers)
 
diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc
index 0500a8123044c..5ac51fb671557 100644
--- a/paddle/infrt/api/infrt_api.cc
+++ b/paddle/infrt/api/infrt_api.cc
@@ -129,7 +129,7 @@ class PredictExecutor : public MlirToRuntimeTranslator {
       auto arg = predict_func.getArgument(i);
       auto type = arg.getType();
       // this param is TensorMap
-      if (type.isa<infrt::DenseTensorMapType>()) {
+      if (type.isa<infrt::DenseHostTensorMapType>()) {
         auto* value = new host_context::Value(std::move(*map));
         arguments_.push_back(value);
         AddValue(predict_func.getArgument(i), value);
diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h
index c8f97e04a1b83..6e3bef9299162 100644
--- a/paddle/infrt/backends/host/phi_allocator.h
+++ b/paddle/infrt/backends/host/phi_allocator.h
@@ -13,6 +13,10 @@ limitations under the License. */
 
 #include "paddle/phi/core/allocator.h"
 
+#ifdef INFRT_WITH_GPU
+#include <cuda_runtime.h>
+#endif
+
 namespace infrt {
 namespace backends {
 
@@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator {
   }
 };
 
+#ifdef INFRT_WITH_GPU
+// TODO(wilber): Just for demo test. we need a more efficient gpu allocator.
+class GpuPhiAllocator : public phi::Allocator {
+ public:
+  static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); }
+
+  AllocationPtr Allocate(size_t bytes_size) {
+    void* ptr;
+    cudaMalloc(&ptr, bytes_size);
+    return AllocationPtr(
+        new phi::Allocation(
+            ptr, bytes_size, phi::Place(phi::AllocationType::GPU)),
+        deleter);
+  }
+};
+#endif
+
 }  // namespace backends
 }  // namespace infrt
diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h
index 5713fdbbaf82b..bcd63dbb39fe8 100644
--- a/paddle/infrt/backends/host/phi_context.h
+++ b/paddle/infrt/backends/host/phi_context.h
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 namespace infrt {
 namespace backends {
@@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext {
   std::unique_ptr<phi::Allocator> alloc_{std::make_unique<CpuPhiAllocator>()};
 };
 
+class GpuPhiContext : public phi::GPUContext {
+ public:
+  using Base = phi::GPUContext;
+  using phi::GPUContext::SetStream;
+  using phi::GPUContext::SetEigenDevice;
+  using phi::GPUContext::SetBlasHandle;
+  using phi::GPUContext::SetDnnHandle;
+  using phi::GPUContext::SetSolverHandle;
+  using phi::GPUContext::SetSparseHandle;
+};
+
 }  // namespace backends
 }  // namespace infrt
diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
index 12cf14060e27c..89dd3b0dc7abf 100644
--- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc
@@ -37,9 +37,9 @@ namespace infrt {
 namespace backends {
 namespace tensorrt {
 
-const char* model_input = "model_input";
-const char* model_output = "model_output1";
-const char* model_output2 = "model_output2";
+const char* model_input = "input_0";
+const char* model_output = "output_0";
+const char* model_output2 = "output_1";
 
 TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
     nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
@@ -82,9 +82,176 @@ TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructNetwork(
   return network;
 }
 
+TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructFCNetwork(
+    nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
+  TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+  if (is_static_shape) {
+    network.reset(builder->createNetworkV2(0U));
+  } else {
+    auto networkFlags =
+        1U << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    network.reset(builder->createNetworkV2(networkFlags));
+  }
+
+  ITensor* data =
+      network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims);
+  CHECK_NOTNULL(data);
+  nvinfer1::Weights kernel_weights;
+  kernel_weights.type = nvinfer1::DataType::kFLOAT;
+  kernel_weights.count = 7840;
+  std::vector<float> weight_data(kernel_weights.count);
+  for (size_t i = 0; i < weight_data.size(); ++i) {
+    weight_data[i] = i % 255 * 0.02f;
+  }
+  kernel_weights.values = weight_data.data();
+  auto* layer = network->addFullyConnected(
+      *data, 10, kernel_weights, nvinfer1::Weights{});
+  CHECK_NOTNULL(layer);
+  auto* out = layer->getOutput(0);
+  out->setName(model_output);
+  network->markOutput(*out);
+  return network;
+}
+
+TrtUniquePtr<nvinfer1::INetworkDefinition> ConstructConvNetwork(
+    nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) {
+  TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+  if (is_static_shape) {
+    network.reset(builder->createNetworkV2(0U));
+  } else {
+    auto networkFlags =
+        1U << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    network.reset(builder->createNetworkV2(networkFlags));
+  }
+
+  ITensor* data =
+      network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims);
+  CHECK_NOTNULL(data);
+  nvinfer1::Weights kernel_weights, bias_weights;
+  kernel_weights.type = nvinfer1::DataType::kFLOAT;
+  bias_weights.type = nvinfer1::DataType::kFLOAT;
+  kernel_weights.count = 81;
+  bias_weights.count = 3;
+  std::vector<float> weight_data(kernel_weights.count);
+  for (size_t i = 0; i < weight_data.size(); ++i) {
+    weight_data[i] = i * 0.02f;
+  }
+  std::vector<float> bias_data(bias_weights.count);
+  for (size_t i = 0; i < bias_data.size(); ++i) {
+    bias_data[i] = i * 0.5f;
+  }
+  kernel_weights.values = weight_data.data();
+  bias_weights.values = bias_data.data();
+  nvinfer1::Dims ksize;
+  ksize.nbDims = 2;
+  ksize.d[0] = 3;
+  ksize.d[1] = 3;
+  auto* layer =
+      network->addConvolutionNd(*data, 3, ksize, kernel_weights, bias_weights);
+  CHECK_NOTNULL(layer);
+  auto* out = layer->getOutput(0);
+  out->setName(model_output);
+  network->markOutput(*out);
+  return network;
+}
+
 // sigmoid(x) = 1 / (1 + exp(-x))
 inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); }
 
+TEST(trt, run_fc_static) {
+  TrtEngine engine(0);
+  auto net = ConstructFCNetwork(
+      engine.GetTrtBuilder(), nvinfer1::Dims3{1, 28, 28}, true);
+  BuildOptions build_options;
+  build_options.max_batch = 4;
+  build_options.workspace = 1024;
+  engine.Build(std::move(net), build_options);
+
+  InferenceOptions inference_options;
+  inference_options.batch = 1;
+
+  phi::GPUPlace place;
+  phi::GPUContext context;
+  context.PartialInitWithoutAllocator();
+  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(place, context.stream())
+                           .get());
+  context.PartialInitWithAllocator();
+
+  phi::DenseTensorMeta meta(
+      phi::DataType::FLOAT32,
+      phi::make_ddim({inference_options.batch, 1, 28, 28}));
+  phi::DenseTensor input;
+  input.set_meta(meta);
+  context.Alloc<float>(&input, input.numel() * sizeof(float));
+  std::vector<float> host_data(inference_options.batch * 1 * 28 * 28, 0);
+  for (size_t i = 0; i < host_data.size(); ++i) {
+    host_data[i] = i % 100 * 0.016f;
+  }
+  paddle::memory::Copy(place,
+                       input.data<float>(),
+                       phi::CPUPlace(),
+                       host_data.data(),
+                       sizeof(float) * host_data.size(),
+                       context.stream());
+
+  std::unordered_map<std::string, phi::DenseTensor*> inputs;
+  inputs.emplace(std::make_pair(model_input, &input));
+  engine.PrepareOutputHandle("output_0");
+  engine.SetUpInference(inference_options, inputs);
+  engine.GetEngineInfo();
+  engine.Run(context);
+  cudaStreamSynchronize(context.stream());
+}
+
+TEST(trt, run_conv_static) {
+  TrtEngine engine(0);
+  auto net = ConstructConvNetwork(
+      engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true);
+  BuildOptions build_options;
+  build_options.max_batch = 4;
+  build_options.workspace = 1024;
+  engine.Build(std::move(net), build_options);
+
+  InferenceOptions inference_options;
+  inference_options.batch = 1;
+
+  phi::GPUPlace place;
+  phi::GPUContext context;
+  context.PartialInitWithoutAllocator();
+  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(place, context.stream())
+                           .get());
+  context.PartialInitWithAllocator();
+
+  phi::DenseTensorMeta meta(
+      phi::DataType::FLOAT32,
+      phi::make_ddim({inference_options.batch, 3, 28, 28}));
+  phi::DenseTensor input;
+  input.set_meta(meta);
+  context.Alloc<float>(&input, input.numel() * sizeof(float));
+  std::vector<float> host_data(inference_options.batch * 3 * 28 * 28, 0);
+  for (size_t i = 0; i < host_data.size(); ++i) {
+    host_data[i] = i % 100 * 0.016f;
+  }
+  paddle::memory::Copy(place,
+                       input.data<float>(),
+                       phi::CPUPlace(),
+                       host_data.data(),
+                       sizeof(float) * host_data.size(),
+                       context.stream());
+
+  std::unordered_map<std::string, phi::DenseTensor*> inputs;
+  inputs.emplace(std::make_pair(model_input, &input));
+  engine.PrepareOutputHandle("output_0");
+  engine.SetUpInference(inference_options, inputs);
+  engine.GetEngineInfo();
+  engine.Run(context);
+  cudaStreamSynchronize(context.stream());
+}
+
 TEST(trt, run_static) {
   TrtEngine static_trt_engine(0);
   auto net = ConstructNetwork(
@@ -122,27 +289,26 @@ TEST(trt, run_static) {
 
   std::unordered_map<std::string, phi::DenseTensor*> inputs;
   inputs.emplace(std::make_pair(model_input, &input));
-  phi::DenseTensor output, output2;
-  std::unordered_map<std::string, phi::DenseTensor*> outputs;
-  outputs.emplace(std::make_pair(model_output, &output));
-  outputs.emplace(std::make_pair(model_output2, &output2));
-
-  static_trt_engine.SetUpInference(inference_options, inputs, &outputs);
+  static_trt_engine.PrepareOutputHandle("output_0");
+  static_trt_engine.PrepareOutputHandle("output_1");
+  static_trt_engine.SetUpInference(inference_options, inputs);
   static_trt_engine.GetEngineInfo();
   static_trt_engine.Run(context);
 
+  phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0");
+  phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1");
   std::vector<float> output_data1(inference_options.batch * 1 * 28 * 28, 0);
   std::vector<float> output_data2(inference_options.batch * 2 * 28 * 28, 0);
   paddle::memory::Copy(phi::CPUPlace(),
                        output_data1.data(),
                        place,
-                       output.data<float>(),
+                       output0->data<float>(),
                        sizeof(float) * output_data1.size(),
                        context.stream());
   paddle::memory::Copy(phi::CPUPlace(),
                        output_data2.data(),
                        place,
-                       output2.data<float>(),
+                       output1->data<float>(),
                        sizeof(float) * output_data2.size(),
                        context.stream());
   cudaStreamSynchronize(context.stream());
@@ -208,27 +374,27 @@ TEST(trt, run_dynamic) {
                        context.stream());
 
   std::unordered_map<std::string, phi::DenseTensor*> inputs;
-  std::unordered_map<std::string, phi::DenseTensor*> outputs;
   inputs.emplace(std::make_pair(model_input, &input));
-  outputs.emplace(std::make_pair(model_output, &output));
-  outputs.emplace(std::make_pair(model_output2, &output2));
-
-  engine.SetUpInference(inference_options, inputs, &outputs);
+  engine.PrepareOutputHandle("output_0");
+  engine.PrepareOutputHandle("output_1");
+  engine.SetUpInference(inference_options, inputs);
   engine.GetEngineInfo();
   engine.Run(context);
+  phi::DenseTensor* output0 = engine.GetOutput("output_0");
+  phi::DenseTensor* output1 = engine.GetOutput("output_1");
 
   std::vector<float> output_data1(inference_options.batch * 1 * 16 * 16, 0);
   std::vector<float> output_data2(inference_options.batch * 2 * 16 * 16, 0);
   paddle::memory::Copy(phi::CPUPlace(),
                        output_data1.data(),
                        place,
-                       output.data<float>(),
+                       output0->data<float>(),
                        sizeof(float) * output_data1.size(),
                        context.stream());
   paddle::memory::Copy(phi::CPUPlace(),
                        output_data2.data(),
                        place,
-                       output2.data<float>(),
+                       output1->data<float>(),
                        sizeof(float) * output_data2.size(),
                        context.stream());
   cudaStreamSynchronize(context.stream());
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc
index 232653e8c41f7..43d356b6d6983 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.cc
+++ b/paddle/infrt/backends/tensorrt/trt_engine.cc
@@ -21,6 +21,7 @@
 #include "paddle/phi/backends/dynload/tensorrt.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
 
 namespace infrt {
 namespace backends {
@@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build,
   return true;
 }
 
+void TrtEngine::PrepareOutputHandle(const std::string& out_name) {
+  phi::DenseTensor t;
+  outputs_.emplace(out_name, t);
+}
+
+phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) {
+  return &outputs_[name];
+}
+
+size_t TrtEngine::GetOutputNum() const { return outputs_.size(); }
+
 bool TrtEngine::SetUpInference(
     const InferenceOptions& inference,
-    const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
-    std::unordered_map<std::string, phi::DenseTensor*>* outputs) {
+    const std::unordered_map<std::string, phi::DenseTensor*>& inputs) {
   // TODO(wilber): now only create one exec_context
   FreshDeviceId();
   CHECK(engine_ != nullptr);
@@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference(
     bindings_.front()->AddBinding(
         bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT);
   }
-  for (auto& it : *outputs) {
+  for (auto& it : outputs_) {
     const int bind_index = engine_->getBindingIndex(it.first.c_str());
     bindings_.front()->AddBinding(
-        bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT);
+        bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT);
   }
 
   return true;
@@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) {
     const int bind_index = engine_->getBindingIndex(bind.name.c_str());
     std::vector<int32_t> ddim;
     auto dims = engine_->getBindingDimensions(bind_index);
+    CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1.";
     ddim.push_back(runtime_batch);
     for (int i = 0; i < dims.nbDims; ++i) {
       ddim.push_back(dims.d[i]);
     }
     bind.buffer->Resize(phi::make_ddim(ddim));
+    // TODO(wilber): now only support float output.
     ctx.Alloc<float>(bind.buffer, sizeof(float) * bind.buffer->numel());
     buffers[bind_index] = static_cast<void*>(bind.buffer->data<float>());
   }
diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h
index 3c8243e3c3838..a26474f8cbb35 100644
--- a/paddle/infrt/backends/tensorrt/trt_engine.h
+++ b/paddle/infrt/backends/tensorrt/trt_engine.h
@@ -81,11 +81,17 @@ class TrtEngine {
   // TODO(wilber): How to support multiple execution contexts?
   bool SetUpInference(
       const InferenceOptions& inference,
-      const std::unordered_map<std::string, phi::DenseTensor*>& inputs,
-      std::unordered_map<std::string, phi::DenseTensor*>* outputs);
+      const std::unordered_map<std::string, phi::DenseTensor*>& inputs);
 
   void GetEngineInfo();
 
+  void PrepareOutputHandle(const std::string& out_name);
+
+  // TODO(wilber): The output tensor names are: output_0, output_1, ...
+  phi::DenseTensor* GetOutput(const std::string&);
+
+  size_t GetOutputNum() const;
+
  private:
   void FreshDeviceId();
 
@@ -112,6 +118,7 @@ class TrtEngine {
   std::vector<std::unique_ptr<Bindings>> bindings_;
   int device_id_{0};
   bool is_dynamic_shape_{false};
+  std::unordered_map<std::string, phi::DenseTensor> outputs_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt
index a3f2d0afafc41..cf3906c32e559 100644
--- a/paddle/infrt/dialect/CMakeLists.txt
+++ b/paddle/infrt/dialect/CMakeLists.txt
@@ -7,16 +7,10 @@ gather_srcs(infrt_src SRCS
     dense_tensor.cc
     mlir_loader.cc
     diagnostic_utils.cc
-    pd_ops.cc
     )
 
 mlir_tablegen_on(tensor_shape DIALECT ts)
 mlir_tablegen_on(dense_tensor DIALECT dt)
-mlir_tablegen_on(pd_op_base DIALECT pd)
-mlir_tablegen_on(pd_ops)
-mlir_tablegen_on(pd_extra_ops)
-
-mlir_add_rewriter(rewrite)
 
 # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code
 add_executable(infrtopt opt.cc)
@@ -24,10 +18,10 @@ target_link_libraries(infrtopt infrt)
 
 add_executable(print-ir print_ir.cc)
 target_link_libraries(print-ir infrt ${mlir_libs})
-add_dependencies(print-ir pd_ops_inc)
 cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS})
 
 add_subdirectory(infrt)
+add_subdirectory(pd)
 add_subdirectory(tensorrt)
 
 if (INFRT_WITH_PHI)
diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td
index 666c7b300af33..822a4879e6f59 100644
--- a/paddle/infrt/dialect/dense_tensor.td
+++ b/paddle/infrt/dialect/dense_tensor.td
@@ -106,7 +106,7 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> {
 
   // input path of model params.
   let arguments = (ins StrAttr:$path);
-  let results = (outs DenseTensorMap:$out);
+  let results = (outs DenseHostTensorMap:$out);
 
   let assemblyFormat = "`(``)`attr-dict";
 }
@@ -121,7 +121,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
 
   // input path of model params.
   let arguments = (ins
-          DenseTensorMap:$map,
+          DenseHostTensorMap:$map,
           StrAttr:$name
           );
   let results = (outs DenseTensor:$output);
@@ -130,17 +130,43 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> {
 }
 
 def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> {
-  let summary = "ddt.tensor_map_get_size operation";
+  let summary = "dt.tensor_map_get_size operation";
 
   let description = [{
     An operation that get the size of a TensorMap.
   }];
 
-  let arguments = (ins DenseTensorMap:$map);
+  let arguments = (ins DenseHostTensorMap:$map);
   let results = (outs I32:$size);
   let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
 }
 
+def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> {
+  let summary = "dt.tensor_list_get_tensor operation";
+
+  let description = [{
+    An operation that can get a tensor from a TensorList.
+  }];
+
+  let arguments = (ins
+          DenseTensorList:$l,
+          I32Attr:$id
+          );
+  let results = (outs DenseTensor:$output);
+  let verifier = ?;
+}
+
+def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> {
+  let summary = "dt.tensor_list_get_size operation";
+
+  let description = [{
+    An operation that get the size of a TensorList.
+  }];
+
+  let arguments = (ins DenseTensorList:$map);
+  let results = (outs I32:$size);
+}
+
 def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> {
   let summary = "dt.get_tensor_shape operation";
 
diff --git a/paddle/infrt/dialect/infrt/common/types.cc b/paddle/infrt/dialect/infrt/common/types.cc
index 62419a196288b..c10679b01342f 100644
--- a/paddle/infrt/dialect/infrt/common/types.cc
+++ b/paddle/infrt/dialect/infrt/common/types.cc
@@ -30,6 +30,8 @@ llvm::Optional<LayoutType> GetLayoutType(llvm::StringRef key) {
     return LayoutType::NCHW;
   else if (key.equals_insensitive("NHWC"))
     return LayoutType::NHWC;
+  else if (key.equals_insensitive("ANY"))
+    return LayoutType::ANY;
   else
     return llvm::None;
 }
@@ -39,6 +41,8 @@ llvm::Optional<PrecisionType> GetPrecisionType(llvm::StringRef key) {
     return PrecisionType::FLOAT32;
   else if (key.equals_insensitive("FP16"))
     return PrecisionType::FLOAT16;
+  else if (key.equals_insensitive("UNK"))
+    return PrecisionType::UNK;
   else
     return llvm::None;
 }
@@ -67,6 +71,9 @@ llvm::StringRef GetString(LayoutType type) {
     case (LayoutType::NHWC):
       str = "NHWC";
       break;
+    case (LayoutType::ANY):
+      str = "ANY";
+      break;
     default:
       str = "Unsupported";
   }
@@ -82,6 +89,9 @@ llvm::StringRef GetString(PrecisionType type) {
     case (PrecisionType::FLOAT16):
       str = "FP16";
       break;
+    case (PrecisionType::UNK):
+      str = "UNK";
+      break;
     default:
       str = "Unsupported";
   }
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td
index c5130e89bb13a..9b1d2132292df 100644
--- a/paddle/infrt/dialect/infrt/ir/infrt_base.td
+++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td
@@ -83,7 +83,14 @@ def DenseTensor : Infrt_Type<"DenseTensor"> {
   );
 }
 
-def DenseTensorMap :  Infrt_Type<"DenseTensorMap"> {
+def DenseHostTensorMap :  Infrt_Type<"DenseHostTensorMap"> {
+  let summary = "infrt dense tensor map";
+  let description = [{dense_tensor map}];
+  let parameters = (ins);
+}
+
+// TODO(wilber): Add !infrt.vec type.
+def DenseTensorList :  Infrt_Type<"DenseTensorList"> {
   let summary = "infrt dense tensor map";
   let description = [{dense_tensor map}];
   let parameters = (ins);
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
index 42de08ebc4193..eb69a95c583f2 100644
--- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
+++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc
@@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
     return LoDTensorType::get(
         parser.getContext(), shape, elementType, lod_level);
   }
+  if (keyword == "dense_tensor_map") {
+    return DenseHostTensorMapType::get(parser.getContext());
+  }
   if (keyword == "dense_tensor") {
     // parse DenseTensor, for example: !i=Infrt.tensor<X86, CUDA, F32>
     llvm::StringRef target;
@@ -134,6 +137,11 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const {
     return DenseTensorType::get(
         parser.getContext(), *targetType, *precisionType, *layoutType);
   }
+
+  if (keyword == "tensor_list") {
+    return infrt::DenseTensorListType::get(parser.getContext());
+  }
+
   // Todo: parse other type
   return mlir::Type();
 }
@@ -154,9 +162,13 @@ void InfrtDialect::printType(::mlir::Type type,
        << lod_tensor_type.getLod_level() << ">";
     return;
   }
+  if (type.isa<infrt::DenseHostTensorMapType>()) {
+    os << "dense_tensor_map";
+    return;
+  }
 
   // print DenseTensorType, for example: !infrt.dense_tensor<CPU, FP32, NCHW>
-  if (type.isa<infrt::DenseTensorType>()) {
+  if (type.isa<DenseTensorType>()) {
     auto dense_tensor_type = type.cast<infrt::DenseTensorType>();
     os << "dense_tensor<" << dense_tensor_type.getTarget() << ", "
        << dense_tensor_type.getPrecision() << ", "
@@ -164,6 +176,10 @@ void InfrtDialect::printType(::mlir::Type type,
     return;
   }
 
+  if (type.isa<infrt::DenseTensorListType>()) {
+    os << "tensor_list";
+    return;
+  }
   llvm_unreachable("unknown infrt type.");
 }
 
diff --git a/paddle/infrt/dialect/infrt/ir/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td
index f5430b03d0d75..82eba2a1746cc 100644
--- a/paddle/infrt/dialect/infrt/ir/infrt_ops.td
+++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td
@@ -53,9 +53,9 @@ def Infrt_CallOp : Infrt_Op<"call"> {
   }];
 }
 
-def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> {
-  let summary = "convert tensor type op";
-  let description = [{convert tensor type op!}];
+def Infrt_TensorCastOp : Infrt_Op<"tensor_cast", [NoSideEffect]> {
+  let summary = "cast tensor type op";
+  let description = [{cast tensor type op!}];
   let arguments = (ins AnyType:$input);
   let results = (outs AnyType:$output);
 }
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td
index 51addb4deb438..3d825a9c762f4 100644
--- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td
@@ -3,19 +3,19 @@
 
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "paddle/infrt/dialect/infrt/ir/infrt_ops.td"
-include "paddle/infrt/dialect/pd_ops.td"
+include "paddle/infrt/dialect/pd/ir/pd_ops.td"
 
-def FuseCvtTensorPattern : Pat<
-       (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)),
-       (Infrt_CvtTensorOp $arg)>;
+def FuseTensorCastPattern : Pat<
+       (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)),
+       (Infrt_TensorCastOp $arg)>;
 
-def FuseFeedCvtTensorPattern : Pat<
-       (Infrt_CvtTensorOp (PD_FeedOp $name)),
+def FuseFeedTensorCastPattern : Pat<
+       (Infrt_TensorCastOp (PD_FeedOp $name)),
        (PD_FeedOp $name)>;
 
 def TypesAreIdentical : Constraint<CPred<"$0.getType() == $1.getType()">>;
-def RedundantCvtTensorOptPattern : Pat<
-  (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg),
+def RedundantTensorCastOptPattern : Pat<
+  (Infrt_TensorCastOp:$res $arg), (replaceWithValue $arg),
   [(TypesAreIdentical $res, $arg)]>;
 
 
diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
index 25ecf2ae99dc3..eec0e0bc7c5ab 100644
--- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
+++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc
@@ -16,7 +16,7 @@
 
 #include <mlir/Transforms/GreedyPatternRewriteDriver.h>
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 namespace {
 #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc"  // NOLINT
 
@@ -27,8 +27,12 @@ struct InfrtOpFusePass
     : public mlir::PassWrapper<InfrtOpFusePass, mlir::FunctionPass> {
  public:
   ::llvm::StringRef getName() const override { return "infrtOpFusePass"; }
+
+  llvm::StringRef getArgument() const override { return "infrt-op-fuse"; }
+
   void runOnFunction() override;
 };
+
 // Implementation of the InfrtOpFusePass.
 void InfrtOpFusePass::runOnFunction() {
   ::mlir::RewritePatternSet patterns(&getContext());
@@ -39,14 +43,18 @@ void InfrtOpFusePass::runOnFunction() {
   if (nullptr == terminator_op) return;
   for (auto operand : terminator_op->getOperands()) {
     auto *op1 = operand.getDefiningOp();
-    auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1);
+    auto cvt_op = ::llvm::dyn_cast<::infrt::TensorCastOp>(op1);
     if (!cvt_op) continue;
     mlir::Value value = cvt_op.input();
     operand.replaceAllUsesWith(value);
     cvt_op.erase();
   }
 }
+
 }  // namespace
+
 std::unique_ptr<mlir::Pass> infrt::createInfrtOpFusePass() {
   return std::make_unique<InfrtOpFusePass>();
 }
+
+mlir::PassRegistration<InfrtOpFusePass> infrt_op_fuse_pass;
diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc
index 0c5944ebf8475..56c375c72d2bb 100644
--- a/paddle/infrt/dialect/init_dialects.cc
+++ b/paddle/infrt/dialect/init_dialects.cc
@@ -20,20 +20,23 @@
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
 
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 #include "paddle/infrt/dialect/phi/ir/phi_base.h"
 #include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
 
 #include "paddle/infrt/dialect/tensor_shape.h"
+#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
 
 namespace infrt {
 void registerCinnDialects(mlir::DialectRegistry &registry) {  // NOLINT
   registry.insert<ts::TensorShapeDialect,
                   InfrtDialect,
                   dt::DTDialect,
-                  mlir::pd::PaddleDialect,
+                  pd::PaddleDialect,
+                  trt::TensorRTDialect
 #ifdef INFRT_WITH_PHI
+                  ,
                   phi::PHIDenseTensorDialect,
                   phi::PHICPUKernelDialect,
                   phi::PHIGPUKernelDialect,
diff --git a/paddle/infrt/dialect/pd/CMakeLists.txt b/paddle/infrt/dialect/pd/CMakeLists.txt
new file mode 100644
index 0000000000000..5f65336453fbd
--- /dev/null
+++ b/paddle/infrt/dialect/pd/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(common)
+add_subdirectory(ir)
+add_subdirectory(pass)
diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt
new file mode 100644
index 0000000000000..ee1b0d4c30deb
--- /dev/null
+++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt
@@ -0,0 +1,4 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    )
diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt
new file mode 100644
index 0000000000000..8aacfc97623c0
--- /dev/null
+++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt
@@ -0,0 +1,7 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    pd_ops.cc
+    )
+add_mlir_dialect(pd_ops pd)
+mlir_tablegen_on(pd_extra_ops)
diff --git a/paddle/infrt/dialect/pd_extra_ops.td b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td
similarity index 90%
rename from paddle/infrt/dialect/pd_extra_ops.td
rename to paddle/infrt/dialect/pd/ir/pd_extra_ops.td
index c6d3f530455f7..cf17db211cbe9 100644
--- a/paddle/infrt/dialect/pd_extra_ops.td
+++ b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td
@@ -4,7 +4,7 @@
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/IR/OpBase.td"
-include "paddle/infrt/dialect/pd_op_base.td"
+include "paddle/infrt/dialect/pd/ir/pd_op_base.td"
 
 def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> {
     let summary = "Computes the Fully Connected result of two tensors";
diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td
similarity index 85%
rename from paddle/infrt/dialect/pd_op_base.td
rename to paddle/infrt/dialect/pd/ir/pd_op_base.td
index f6af4c83aed8b..e28854a848023 100644
--- a/paddle/infrt/dialect/pd_op_base.td
+++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td
@@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 
-def PD_Dialect : Dialect {
+def Paddle_Dialect : Dialect {
   let name = "pd";
 
   let description = [{
@@ -16,16 +16,16 @@ def PD_Dialect : Dialect {
 
     This dialect contains the PaddlePaddle operators.
   }];
-
-  let cppNamespace = "mlir::pd";
+  let hasConstantMaterializer = 1;
+  let cppNamespace = "infrt::pd";
 }
 
 class PD_Op<string mnemonic, list<OpTrait> traits = []> :
-      Op<PD_Dialect, mnemonic, traits>;
+      Op<Paddle_Dialect, mnemonic, traits>;
 
 
 class PD_PaddleAttr <string name, string description> :
-      Attr<CPred<"$_self.isa<mlir::pd::" # name # "Attr>()">,
+      Attr<CPred<"$_self.isa<infrt::pd::" # name # "Attr>()">,
           "PaddlePaddle " # description # " attribute">;
 
 
@@ -33,12 +33,12 @@ class PD_PaddleAttr <string name, string description> :
 // PaddlePaddle type definitions
 //===----------------------------------------------------------------------===//
 
-def PD_PDDialectType : Type<CPred<"$_self.isa<mlir::pd::PDType>()">, "PaddlePaddle type">;
+def PD_PDDialectType : Type<CPred<"$_self.isa<infrt::pd::PDType>()">, "PaddlePaddle type">;
 
 class PD_PaddleType <string name, string description> :
-      Type<CPred<"$_self.isa<mlir::pd::" # name #"Type>()">,
+      Type<CPred<"$_self.isa<infrt::pd::" # name #"Type>()">,
          "Paddle " # description # " type">,
-      BuildableType<"getType<mlir::pd::" # name # "Type>()">;
+      BuildableType<"getType<infrt::pd::" # name # "Type>()">;
 
 //===----------------------------------------------------------------------===//
 // Integer types
diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc
new file mode 100644
index 0000000000000..b5ba48581ee62
--- /dev/null
+++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
+
+#include <mlir/IR/Matchers.h>
+#include <mlir/IR/PatternMatch.h>
+
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc"
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc"  // NOLINT
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc"  // NOLINT
+
+namespace infrt {
+namespace pd {
+void PaddleDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc"  // NOLINT
+      ,
+#define GET_OP_LIST
+#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc"  // NOLINT
+      >();
+}
+
+mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder,
+                                                    mlir::Attribute value,
+                                                    mlir::Type type,
+                                                    mlir::Location loc) {
+  return builder.create<ConstantOp>(loc, value);
+}
+
+void ConstantOp::build(mlir::OpBuilder &builder,
+                       mlir::OperationState &state,
+                       mlir::Attribute value) {
+  if (auto elem_attr = value.dyn_cast<mlir::ElementsAttr>()) {
+    return ConstantOp::build(builder, state, elem_attr);
+  } else if (value.isa<mlir::BoolAttr, mlir::FloatAttr, mlir::IntegerAttr>()) {
+    mlir::ShapedType type =
+        mlir::RankedTensorType::get(/*shape=*/{}, value.getType());
+    state.addAttribute("value", mlir::DenseElementsAttr::get(type, value));
+    state.addTypes(type);
+    return;
+  }
+  llvm_unreachable("unsupported attribute type for building pd.constant");
+}
+
+mlir::LogicalResult ConstantOp::inferReturnTypes(
+    mlir::MLIRContext *context,
+    mlir::Optional<mlir::Location> location,
+    mlir::ValueRange operands,
+    mlir::DictionaryAttr attributes,
+    mlir::RegionRange regions,
+    llvm::SmallVectorImpl<mlir::Type> &inferredReturnTypes) {
+  inferredReturnTypes.push_back(attributes.get("value").getType());
+  return mlir::success();
+}
+mlir::OpFoldResult ConstantOp::fold(
+    ::llvm::ArrayRef<mlir::Attribute> operands) {
+  return value();
+}
+}  // namespace pd
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h
new file mode 100644
index 0000000000000..8383ff6ed8201
--- /dev/null
+++ b/paddle/infrt/dialect/pd/ir/pd_ops.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+//===----------------------------------------------------------------------===//
+// Dialect
+//===----------------------------------------------------------------------===//
+#include <llvm/ADT/StringMap.h>
+#include <mlir/IR/BuiltinTypes.h>
+#include <mlir/IR/Dialect.h>
+#include <mlir/IR/OpDefinition.h>
+#include <mlir/IR/OpImplementation.h>
+#include <mlir/Interfaces/InferTypeOpInterface.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
+
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc"
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc"
+#define GET_OP_CLASSES
+#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc"
diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt
new file mode 100644
index 0000000000000..827df597b76e2
--- /dev/null
+++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt
@@ -0,0 +1,8 @@
+
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    pd_op_fuse_pass.cc
+    )
+
+mlir_add_rewriter(pd_op_fuse)
diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td
similarity index 97%
rename from paddle/infrt/dialect/rewrite.td
rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td
index 62e7471a390df..f5a8ea78d7d9d 100644
--- a/paddle/infrt/dialect/rewrite.td
+++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td
@@ -3,8 +3,8 @@
 
 include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
-include "paddle/infrt/dialect/pd_ops.td"
-include "paddle/infrt/dialect/pd_extra_ops.td"
+include "paddle/infrt/dialect/pd/ir/pd_ops.td"
+include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td"
 
 //===----------------------------------------------------------------------===//
 // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'.
diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc
new file mode 100644
index 0000000000000..8bdf957db27d8
--- /dev/null
+++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h"  // NOLINT
+
+#include <mlir/Transforms/GreedyPatternRewriteDriver.h>
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
+
+namespace {
+#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc"  // NOLINT
+
+/*
+ * PdOpFusePass.
+ */
+struct PdOpFusePass
+    : public mlir::PassWrapper<PdOpFusePass, mlir::FunctionPass> {
+ public:
+  ::llvm::StringRef getName() const override { return "PdOpFusePass"; }
+
+  llvm::StringRef getArgument() const override { return "pd-op-fuse"; }
+
+  void runOnFunction() override;
+};
+
+// Implementation of the PdOpFusePass.
+void PdOpFusePass::runOnFunction() {
+  ::mlir::RewritePatternSet patterns(&getContext());
+  populateWithGenerated(patterns);
+  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+}
+
+}  // namespace
+
+mlir::PassRegistration<PdOpFusePass> infrt_op_fuse_pass;
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h
similarity index 69%
rename from paddle/fluid/operators/reduce_ops/reduce_all_op.cu
rename to paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h
index a1f1a228aeb3a..854545ab1a263 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+#pragma once
+#include <mlir/Pass/Pass.h>
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_all,
-    ops::ReduceCudaKernel<bool, kps::LogicalAndFunctor, kps::IdentityFunctor>);
+namespace infrt {
+/*
+ * PdOpFusePass.
+ */
+std::unique_ptr<mlir::Pass> CreatePdOpFusePass();
+
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc
deleted file mode 100644
index 96e9e307f2fd3..0000000000000
--- a/paddle/infrt/dialect/pd_ops.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/infrt/dialect/pd_ops.h"
-
-#include <mlir/IR/Matchers.h>
-#include <mlir/IR/PatternMatch.h>
-
-#define GET_OP_CLASSES
-#include "paddle/infrt/dialect/pd_ops.cpp.inc"  // NOLINT
-#define GET_OP_CLASSES
-#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc"  // NOLINT
-
-namespace mlir {
-namespace pd {
-
-#include "paddle/infrt/dialect/rewrite.cpp.inc"  // NOLINT
-
-PaddleDialect::PaddleDialect(MLIRContext *context)
-    : Dialect("pd", context, TypeID::get<PaddleDialect>()) {
-  addOperations<
-#define GET_OP_LIST
-#include "paddle/infrt/dialect/pd_ops.cpp.inc"  // NOLINT
-      ,
-#define GET_OP_LIST
-#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc"  // NOLINT
-      >();
-}
-
-mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder,
-                                                    mlir::Attribute value,
-                                                    mlir::Type type,
-                                                    mlir::Location loc) {
-  return builder.create<ConstantOp>(loc, value);
-}
-
-void ConstantOp::build(OpBuilder &builder,
-                       OperationState &state,
-                       Attribute value) {
-  if (auto elem_attr = value.dyn_cast<ElementsAttr>()) {
-    return ConstantOp::build(builder, state, elem_attr);
-  } else if (value.isa<BoolAttr, FloatAttr, IntegerAttr>()) {
-    ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType());
-    state.addAttribute("value", DenseElementsAttr::get(type, value));
-    state.addTypes(type);
-    return;
-  }
-  llvm_unreachable("unsupported attribute type for building pd.constant");
-}
-
-LogicalResult ConstantOp::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(attributes.get("value").getType());
-  return success();
-}
-mlir::OpFoldResult ConstantOp::fold(
-    ::llvm::ArrayRef<mlir::Attribute> operands) {
-  return value();
-}
-/*
-LogicalResult ElementwiseAdd::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(operands[0].getType());
-  return success();
-}
-*/
-
-void Elementwise_addOp::getCanonicalizationPatterns(
-    mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
-  results.insert<FuseMulAdd>(context);
-}
-
-/*
-mlir::OpFoldResult ElementwiseAdd::fold(
-    llvm::ArrayRef<mlir::Attribute> operands) {
-  if (getElementTypeOrSelf(getType()).isa<FloatType>()) {
-    if (!operands[0] || !operands[1]) return {};
-    DenseElementsAttr lhs = operands[0].dyn_cast<DenseElementsAttr>();
-    DenseElementsAttr rhs = operands[1].dyn_cast<DenseElementsAttr>();
-    if (!lhs || !rhs) return {};
-    ShapedType type = getType().template cast<ShapedType>();
-    if (!type.hasStaticShape()) return {};
-    Type etype = type.getElementType();
-    if (!etype.isa<FloatType>()) return {};
-    SmallVector<APFloat, 6> values;
-    values.reserve(lhs.getNumElements());
-    for (const auto zip :
-         llvm::zip(lhs.getValues<APFloat>(), rhs.getValues<APFloat>())) {
-      values.push_back(
-          std::plus<APFloat>()(std::get<0>(zip), std::get<1>(zip)));
-    }
-    return DenseElementsAttr::get(type, values);
-  }
-  return {};
-}
-
-LogicalResult ElementwiseDiv::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(operands[0].getType());
-  return success();
-}
-
-LogicalResult ElementwiseMul::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(operands[0].getType());
-  return success();
-}
-
-LogicalResult ElementwiseSub::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(operands[0].getType());
-  return success();
-}
-
-LogicalResult MulOp::inferReturnTypes(
-    MLIRContext *context,
-    Optional<Location> location,
-    ValueRange operands,
-    DictionaryAttr attributes,
-    RegionRange regions,
-    SmallVectorImpl<Type> &inferredReturnTypes) {
-  inferredReturnTypes.push_back(operands[0].getType());
-  return success();
-}
-
-void ReluOp::getCanonicalizationPatterns(
-    mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
-  results.insert<FuseFCRelu>(context);
-}
-
-void FusedRepeatedFCRelu::getCanonicalizationPatterns(
-    mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
-  results.insert<FuseRepeatedFCRelu2>(context);
-}
-
-void BatchNormOp::getCanonicalizationPatterns(
-    mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) {
-  results.insert<FuseBatchNormWithConvPattern>(context);
-}*/
-
-}  // namespace pd
-}  // namespace mlir
diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h
deleted file mode 100644
index e6b0f30c05905..0000000000000
--- a/paddle/infrt/dialect/pd_ops.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <mlir/Dialect/Traits.h>
-#include <mlir/IR/Attributes.h>
-#include <mlir/IR/Builders.h>
-#include <mlir/IR/BuiltinOps.h>
-#include <mlir/IR/BuiltinTypes.h>
-#include <mlir/IR/Dialect.h>
-#include <mlir/IR/Matchers.h>
-#include <mlir/IR/OpImplementation.h>
-#include <mlir/IR/TypeUtilities.h>
-#include <mlir/Interfaces/CallInterfaces.h>
-#include <mlir/Interfaces/DerivedAttributeOpInterface.h>
-#include <mlir/Interfaces/InferTypeOpInterface.h>
-#include <mlir/Interfaces/LoopLikeInterface.h>
-#include <mlir/Interfaces/SideEffectInterfaces.h>
-#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-
-namespace mlir {
-namespace pd {
-
-class PaddleDialect : public Dialect {
- public:
-  explicit PaddleDialect(MLIRContext* context);
-
-  static StringRef getDialectNamespace() { return "pd"; }
-
-  /// A hook used to materialize constant values with the given type.
-  Operation* materializeConstant(OpBuilder& builder,
-                                 Attribute value,
-                                 Type type,
-                                 Location loc) override;
-
-  Type parseType(DialectAsmParser& parser) const override {
-    return Dialect::parseType(parser);
-  }
-  void printType(Type type, DialectAsmPrinter& printer) const override {
-    Dialect::printType(type, printer);
-  }
-};
-
-}  // namespace pd
-}  // namespace mlir
-
-#define GET_OP_CLASSES
-#include "paddle/infrt/dialect/pd_ops.hpp.inc"
-#define GET_OP_CLASSES
-#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc"
diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt
index 4e73a533d99a7..436ff0a40480c 100644
--- a/paddle/infrt/dialect/phi/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(pass)
 add_executable(phi-ir-exec phi_ir_exec.cc)
 target_link_libraries(phi-ir-exec infrt)
 
+
 add_executable(phi-exec phi_exec.cc)
 target_link_libraries(phi-exec infrt)
 
diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h
index bd258cb103879..8e831c8c27d50 100644
--- a/paddle/infrt/dialect/phi/data_type.h
+++ b/paddle/infrt/dialect/phi/data_type.h
@@ -23,16 +23,16 @@
 
 namespace infrt {
 
-phi::Backend ConvertTargetToPhi(TargetType target);
-TargetType ConvertTargetFromPhi(phi::Backend backend);
+::phi::Backend ConvertTargetToPhi(TargetType target);
+TargetType ConvertTargetFromPhi(::phi::Backend backend);
 
-phi::DataType ConvertPrecisionToPhi(PrecisionType precision);
-PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype);
+::phi::DataType ConvertPrecisionToPhi(PrecisionType precision);
+PrecisionType ConvertPrecisionFromPhi(::phi::DataType datatype);
 
-phi::DataLayout ConvertLayoutToPhi(LayoutType layout);
-LayoutType ConvertLayoutFromPhi(phi::DataLayout layout);
+::phi::DataLayout ConvertLayoutToPhi(LayoutType layout);
+LayoutType ConvertLayoutFromPhi(::phi::DataLayout layout);
 
-phi::KernelKey ConvertPlaceToPhi(const Place& place);
-Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg);
+::phi::KernelKey ConvertPlaceToPhi(const Place& place);
+Place ConvertPlaceFromPhi(::phi::TensorArgDef tensor_arg);
 
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
index 5d7338ec4292e..376d62deecee7 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td
@@ -18,8 +18,8 @@ def PHI_Dialect : Dialect {
 
 def PhiOpTrait : NativeOpTrait<"PhiOpTrait">;
 
-class PHI_Type<string type, list<Trait> traits = []>
-   : TypeDef<PHI_Dialect, type, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
+class PHI_Type<string type, list<Trait> traits = [], string baseCppClass = "::mlir::Type">
+   : TypeDef<PHI_Dialect, type, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove]), baseCppClass> {}
 
 def Allocator : PHI_Type<"Allocator"> {
    let mnemonic = "allocator";
@@ -37,4 +37,8 @@ def Allocator : PHI_Type<"Allocator"> {
    let assemblyFormat = "`<` $target `>`";
  }
 
+def PD_DenseTensorMap : PHI_Type<"DenseTensorMap"> {
+  let mnemonic = "dense_tensor_map";
+}
+
 #endif
diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
index 8c3a79498d74d..3af7033d2f4c7 100644
--- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
+++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td
@@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect {
 class PDT_Op<string mnemonic, list<OpTrait> traits = []> : Op<PHI_DenseTensorDialect,
   mnemonic, !listconcat(traits, [PhiOpTrait, IsolatedFromAbove])> {}
 
-class CreateDenseTensorOp 
-      : PDT_Op<"create_dense_tensor", [NoSideEffect]> {
+class CreateDenseTensorOp<string target>
+      : PDT_Op<"create_dense_tensor." # target, [NoSideEffect]> {
   let arguments = (ins Context:$context, I64ArrayAttr:$dims, 
     LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision);
   let results = (outs DenseTensor:$output);
@@ -51,10 +51,46 @@ class CreateContextOp<string target>
   let results = (outs Context:$output);
 }
 
-def PDT_CreateDenseTensorOp : CreateDenseTensorOp;
+def PDT_LoadParamsOp : PDT_Op<"load_params", [NoSideEffect]> {
+  // input path of model params.
+  let arguments = (ins StrAttr:$path);
+  let results = (outs PD_DenseTensorMap:$out);
+
+  let assemblyFormat = "`(``)`attr-dict";
+}
+
+def PDT_LoadCombinedParamsOp : PDT_Op<"load_combined_params", [NoSideEffect]> {
+  // input path of model params.
+  let arguments = (ins StrAttr:$model_path, StrAttr:$params_path);
+  let results = (outs PD_DenseTensorMap:$out);
+
+  let assemblyFormat = "`(``)`attr-dict";
+}
+
+def PDT_TensorMapGetSizeOp : PDT_Op<"tensor_map_get_size", [NoSideEffect]> {
+  let arguments = (ins PD_DenseTensorMap:$map);
+  let results = (outs I32:$size);
+  let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)";
+}
+
+class TensorMapGetTensorOp:
+      PDT_Op<"tensor_map_get_tensor"> {
+  let arguments = (ins
+          PD_DenseTensorMap:$map,
+          StrAttr:$name
+          );
+  let results = (outs DenseTensor:$output);
+  let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)";
+  let verifier = ?;
+}
+
+def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">;
+def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">;
 def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp<F32ArrayAttr, "f32">;
 def PDT_CreateCPUContextOp : CreateContextOp<"cpu">;
+def PDT_CreateGPUContextOp : CreateContextOp<"gpu">;
 def PDT_PrintDenseTensor : PrintDenseTensorOp;
+def PDT_TensorMapGetTensorOp: TensorMapGetTensorOp;
 
 def FakeKernelOp : PDT_Op<"fake_phi_kernel"> {
   let arguments = (ins Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y);
diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc
index d8095d7f3f13f..f91381fe72903 100644
--- a/paddle/infrt/dialect/phi/ir/phi_base.cc
+++ b/paddle/infrt/dialect/phi/ir/phi_base.cc
@@ -29,6 +29,7 @@ namespace infrt {
 namespace phi {
 
 void PHIDialect::initialize() {
+  LOG(INFO) << "PHI Dialect initalized";
   addOperations<
 #define GET_OP_LIST
 #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc"  // NOLINT
diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt
index 5c55a6b0acaed..dc60ecf63fe2e 100644
--- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt
+++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt
@@ -2,6 +2,8 @@ core_gather_headers()
 
 gather_srcs(infrt_src SRCS
     proto_arg_map_context.cc
-    phi_op_cvt_pass.cc
+    phi_op_convert_pass.cc
     kernel_op_desc.cc
-    )
+   )
+
+cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt)
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
index 353b1054e7137..a26e8e2dca570 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc
@@ -73,7 +73,7 @@ std::string getPhiLayoutSuffix(LayoutType layout) {
   }
 }
 
-std::vector<PhiKernelDesc> getCandidateKernels(
+std::vector<PhiKernelDesc> GetCandidateKernels(
     std::string name, const std::vector<Place>& valid_palces) {
   std::vector<PhiKernelDesc> candidate_kernels;
   PhiKernelDesc phi_kernel_desc;
@@ -88,19 +88,20 @@ std::vector<PhiKernelDesc> getCandidateKernels(
       if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue;
       place.layout = LayoutType::ANY;
     }
-    phi_kernel_desc.kernelType = place;
-    phi_kernel_desc.inputsType.clear();
-    phi_kernel_desc.outputsType.clear();
+    phi_kernel_desc.kernel_type = place;
+    phi_kernel_desc.input_types.clear();
+    phi_kernel_desc.output_types.clear();
     phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def();
     const paddle::SmallVector<phi::TensorArgDef>& input_arg =
         args_def.input_defs();
     const paddle::SmallVector<phi::TensorArgDef>& output_arg =
         args_def.output_defs();
     for (auto tensor_arg : input_arg) {
-      phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg));
+      phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg));
     }
     for (auto tensor_arg : output_arg) {
-      phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg));
+      phi_kernel_desc.output_types.emplace_back(
+          ConvertPlaceFromPhi(tensor_arg));
     }
     candidate_kernels.emplace_back(phi_kernel_desc);
   }
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
index b1f7c6c0811de..cdc8f7cbff553 100644
--- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h
@@ -21,16 +21,16 @@
 namespace infrt {
 
 struct PhiKernelDesc {
-  std::vector<Place> inputsType;   // kernel input place
-  std::vector<Place> outputsType;  // kernel output place
-  Place kernelType;                // kernel place
+  std::vector<Place> input_types;   // kernel input place
+  std::vector<Place> output_types;  // kernel output place
+  Place kernel_type;                // kernel place
 };
 
 std::string getPhiTargetPrefix(TargetType target);
 std::string getPhiPrecisionSuffix(PrecisionType precision);
 std::string getPhiLayoutSuffix(LayoutType layout);
 
-std::vector<PhiKernelDesc> getCandidateKernels(
+std::vector<PhiKernelDesc> GetCandidateKernels(
     std::string name, const std::vector<Place>& valid_palces);
 
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc
new file mode 100644
index 0000000000000..bd5f0799a60d5
--- /dev/null
+++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
+#include "paddle/phi/kernels/declarations.h"
+
+namespace infrt {
+
+TEST(phi, get_op_desc) {
+  std::vector<Place> places;
+  places.emplace_back(
+      TargetType::CPU, PrecisionType::FLOAT32, LayoutType::NCHW);
+  auto kernels = GetCandidateKernels("addmm", places);
+  ASSERT_GE(kernels.size(), 1UL);
+}
+
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
new file mode 100644
index 0000000000000..13cba6eeabb66
--- /dev/null
+++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc
@@ -0,0 +1,268 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
+
+#include <glog/logging.h>
+#include <llvm/ADT/SetVector.h>
+#include <mlir/Analysis/SliceAnalysis.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/OperationSupport.h>
+#include <list>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/infrt/common/string.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"
+#include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
+#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
+#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h"
+#include "paddle/phi/core/compat/op_utils.h"
+#include "paddle/phi/core/kernel_factory.h"
+#include "paddle/phi/ops/compat/signatures.h"
+
+namespace {
+class PhiOpConvertPass
+    : public mlir::PassWrapper<PhiOpConvertPass, mlir::FunctionPass> {
+ public:
+  ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; }
+  void runOnFunction() override;
+  PhiOpConvertPass();
+  explicit PhiOpConvertPass(const std::vector<infrt::Place> &valid_places)
+      : valid_places_(valid_places) {}
+
+  PhiOpConvertPass(const PhiOpConvertPass &other)
+      : mlir::PassWrapper<PhiOpConvertPass, mlir::FunctionPass>(*this),
+        valid_places_(other.valid_places_) {}
+
+  ::llvm::StringRef getArgument() const override { return "phi-op-convert"; }
+  void getDependentDialects(mlir::DialectRegistry &registry) const override;
+
+ private:
+  void convertStage();
+  void dispatchStage();
+
+  // Force a specified data format for all layout sensitive operations.
+  Option<std::string> valid_places_options_{
+      *this,
+      "valid-targets",
+      llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")};
+
+  std::vector<infrt::Place> valid_places_;
+};
+// Implementation of the PhiOpConvertPass.
+void PhiOpConvertPass::runOnFunction() {
+  convertStage();
+  dispatchStage();
+}
+
+void PhiOpConvertPass::convertStage() {
+  mlir::Block &body = getFunction().front();
+  std::vector<mlir::Operation *> worklist;
+  for (auto &op : body.without_terminator()) {
+    worklist.push_back(&op);
+  }
+  mlir::OpBuilder builder(&body, body.begin());
+  while (!worklist.empty()) {
+    auto *op = worklist.back();
+    worklist.pop_back();
+    if (!op) continue;
+
+    auto op_name = op->getName().getIdentifier().str();
+
+    // only convert op in pd dialect.
+    if (op_name.substr(0, 3) != "pd.") continue;
+    op_name = op_name.substr(3);
+    if (pd_dialect_inputs_info_map_.find(op_name) ==
+            pd_dialect_inputs_info_map_.end() ||
+        pd_dialect_outputs_info_map_.find(op_name) ==
+            pd_dialect_outputs_info_map_.end()) {
+      LOG(WARNING) << "No op info found for " << op_name;
+      // Todo: print log
+      continue;
+    }
+    auto loc = getFunction().getLoc();
+    builder.setInsertionPoint(op);
+    if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) {
+      std::string kernel_name = phi::TransToPhiKernelName(op_name);
+      auto kernel_op = builder.create<infrt::KernelOp>(loc,
+                                                       op->getResultTypes(),
+                                                       op->getOperands(),
+                                                       kernel_name,
+                                                       op->getAttrDictionary());
+      op->replaceAllUsesWith(kernel_op.getResults());
+    } else {
+      ::phi::KernelSignature kernel_sign =
+          ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
+              infrt::ProtoArgumentMappingContext(op));
+      // resort input&output according to kernel_sign
+      ::llvm::SmallVector<mlir::Value, 4> inputs, ori_output;
+      ::llvm::SmallVector<mlir::Type, 4> output_types;
+      for (const std::string &str : std::get<0>(kernel_sign.args)) {
+        if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) {
+          LOG(ERROR) << "No input info for Op " << op_name << " and argument "
+                     << str;
+          return;
+        }
+        uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str);
+        inputs.push_back(op->getOperands()[index]);
+      }
+
+      for (const std::string &str : std::get<2>(kernel_sign.args)) {
+        if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) {
+          LOG(ERROR) << "No output info for Op " << op_name << " and argument "
+                     << str;
+          return;
+        }
+        uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str);
+        output_types.push_back(op->getResultTypes()[index]);
+        ori_output.push_back(op->getResult(index));
+      }
+      auto kernel_op = builder.create<infrt::KernelOp>(
+          loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary());
+      for (size_t index = 0; index < ori_output.size(); ++index) {
+        ori_output[index].replaceAllUsesWith(kernel_op.getResult(index));
+      }
+    }
+    CHECK(op->use_empty());
+    op->erase();
+  }
+}
+
+void PhiOpConvertPass::dispatchStage() {
+  std::vector<infrt::KernelOp> worklist;
+  mlir::Block &block = getFunction().front();
+  for (auto &op : block) {
+    infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null<infrt::KernelOp>(&op);
+    if (nullptr != kernel_op) worklist.push_back(kernel_op);
+  }
+
+  mlir::OpBuilder builder(&block, block.begin());
+  std::map<infrt::TargetType, mlir::Value> phi_context;
+  for (infrt::KernelOp kernel_op : worklist) {
+    std::string kernel_name = kernel_op.name().str();
+    std::vector<infrt::PhiKernelDesc> candidates =
+        GetCandidateKernels(kernel_name, valid_places_);
+    if (candidates.empty()) {
+      LOG(FATAL) << "No candidate kernels for op:" << kernel_name;
+      continue;
+    }
+    builder.setInsertionPoint(kernel_op);
+
+    // Todo: Implimentation the concrete pass pick strategy
+    const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front();
+
+    kernel_name =
+        infrt::getPhiTargetPrefix(phi_kernel_desc.kernel_type.target) +
+        kernel_name +
+        infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernel_type.precision) +
+        infrt::getPhiLayoutSuffix(phi_kernel_desc.kernel_type.layout);
+
+    mlir::OperationName operation_name(kernel_name, kernel_op.getContext());
+    mlir::OperationState operation_state(kernel_op.getLoc(), operation_name);
+
+    if (phi_context.find(phi_kernel_desc.kernel_type.target) ==
+        phi_context.end()) {
+      switch (phi_kernel_desc.kernel_type.target) {
+        case infrt::TargetType::CPU: {
+          auto context_value =
+              builder
+                  .create<infrt::phi::CreateCPUContextOp>(
+                      kernel_op.getLoc(),
+                      infrt::phi::ContextType::get(kernel_op.getContext(),
+                                                   infrt::TargetType::CPU))
+                  .output();
+          phi_context[infrt::TargetType::CPU] = context_value;
+        } break;
+        case infrt::TargetType::GPU:
+        case infrt::TargetType::UNK:
+        default:
+          LOG(FATAL) << "Unsupported TargetType";
+          break;
+      }
+    }
+    operation_state.addOperands(
+        phi_context.at(phi_kernel_desc.kernel_type.target));
+
+    for (size_t index = 0; index < phi_kernel_desc.input_types.size();
+         ++index) {
+      mlir::Value input = kernel_op.getOperand(index);
+      auto cvt_tensor_type_op = builder.create<infrt::TensorCastOp>(
+          kernel_op.getLoc(),
+          infrt::DenseTensorType::get(
+              kernel_op.getContext(),
+              phi_kernel_desc.input_types[index].target,
+              phi_kernel_desc.input_types[index].precision,
+              phi_kernel_desc.input_types[index].layout),
+          input);
+      operation_state.addOperands(cvt_tensor_type_op.output());
+    }
+
+    for (size_t index = 0; index < phi_kernel_desc.output_types.size();
+         ++index) {
+      operation_state.addTypes(infrt::DenseTensorType::get(
+          kernel_op.getContext(),
+          phi_kernel_desc.output_types[index].target,
+          phi_kernel_desc.output_types[index].precision,
+          phi_kernel_desc.output_types[index].layout));
+    }
+    operation_state.addAttributes(kernel_op.attrsAttr().getValue());
+    mlir::Operation *phi_operation = builder.createOperation(operation_state);
+    for (size_t index = 0; index < phi_kernel_desc.output_types.size();
+         ++index) {
+      mlir::Value input = phi_operation->getResult(index);
+      auto cvt_tensor_type_op = builder.create<infrt::TensorCastOp>(
+          kernel_op.getLoc(), kernel_op.getResultTypes()[index], input);
+      kernel_op.getResult(index).replaceAllUsesWith(
+          cvt_tensor_type_op.output());
+    }
+    kernel_op.erase();
+  }
+}
+
+PhiOpConvertPass::PhiOpConvertPass() {
+  if (!valid_places_options_.hasValue()) {
+    valid_places_.emplace_back(infrt::TargetType::CPU,
+                               infrt::PrecisionType::FLOAT32,
+                               infrt::LayoutType::NCHW);
+    return;
+  }
+
+  LOG(FATAL) << "To be done for specifying places in command line";
+}
+
+void PhiOpConvertPass::getDependentDialects(
+    mlir::DialectRegistry &registry) const {
+  registry.insert<infrt::InfrtDialect>();
+  registry.insert<infrt::phi::PHIDialect>();
+  registry.insert<infrt::phi::PHIDenseTensorDialect>();
+  registry.insert<infrt::phi::PHICPUKernelDialect>();
+  registry.insert<infrt::phi::PHIGPUKernelDialect>();
+}
+
+}  // namespace
+
+mlir::PassRegistration<PhiOpConvertPass> phi_op_convert;
+
+std::unique_ptr<mlir::Pass> infrt::createPhiOpCvtPass(
+    std::vector<Place> valid_places) {
+  return std::make_unique<PhiOpConvertPass>(valid_places);
+}
+
+std::unique_ptr<mlir::Pass> infrt::createPhiOpCvtPass() {
+  return std::make_unique<PhiOpConvertPass>();
+}
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
similarity index 86%
rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h
rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
index 8b1944042aa7c..5a2c0ee96ed0d 100644
--- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h
+++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h
@@ -21,7 +21,8 @@ namespace infrt {
  * phiOpCvtPass.
  * Convert the general operators from pd Dialect to phi dialect.
  */
-std::unique_ptr<mlir::Pass> createPhiOpCvtPass(
-    std::vector<Place> valid_places = std::vector<Place>());
+std::unique_ptr<mlir::Pass> createPhiOpCvtPass(std::vector<Place> valid_places);
+
+std::unique_ptr<mlir::Pass> createPhiOpCvtPass();
 
 }  // namespace infrt
diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc
deleted file mode 100644
index 485bf2a75d890..0000000000000
--- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
-
-#include <glog/logging.h>
-#include <llvm/ADT/SetVector.h>
-#include <mlir/Analysis/SliceAnalysis.h>
-#include <mlir/IR/Builders.h>
-#include <mlir/IR/Operation.h>
-#include <mlir/IR/OperationSupport.h>
-#include <list>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
-#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h"
-#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h"
-#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/phi/ops/compat/signatures.h"
-
-namespace {
-class phiOpCvtPass
-    : public mlir::PassWrapper<phiOpCvtPass, mlir::FunctionPass> {
- public:
-  ::llvm::StringRef getName() const override { return "phiOpCvtPass"; }
-  void runOnFunction() override;
-  explicit phiOpCvtPass(
-      std::vector<infrt::Place> valid_places = std::vector<infrt::Place>())
-      : valid_places_(valid_places) {}
-
- private:
-  void convertStage();
-  void diapatchStage();
-  std::vector<infrt::Place> valid_places_;
-};
-
-// Implementation of the phiOpCvtPass.
-void phiOpCvtPass::runOnFunction() {
-  convertStage();
-  diapatchStage();
-}
-void phiOpCvtPass::convertStage() {
-  mlir::Block &body = getFunction().front();
-  std::vector<mlir::Operation *> worklist;
-  for (auto &op : body.without_terminator()) {
-    worklist.push_back(&op);
-  }
-  mlir::OpBuilder builder(&body, body.begin());
-  while (!worklist.empty()) {
-    auto *op = worklist.back();
-    worklist.pop_back();
-    if (op == nullptr) continue;
-
-    std::string op_name = op->getName().getIdentifier().str();
-
-    // only convert op in pd dialect.
-    if (op_name.substr(0, 3) != "pd.") continue;
-    op_name = op_name.substr(3);
-    if (pd_dialect_inputs_info_map_.find(op_name) ==
-            pd_dialect_inputs_info_map_.end() ||
-        pd_dialect_outputs_info_map_.find(op_name) ==
-            pd_dialect_outputs_info_map_.end()) {
-      // Todo: print log
-      continue;
-    }
-
-    ::phi::KernelSignature kernel_sign =
-        ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)(
-            infrt::ProtoArgumentMappingContext(op));
-    // resort input&output according to kernel_sign
-    ::llvm::SmallVector<mlir::Value, 4> inputs, ori_output;
-    ::llvm::SmallVector<mlir::Type, 4> output_types;
-    for (const std::string &str : std::get<0>(kernel_sign.args)) {
-      if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) {
-        // Todo: print error log
-        return;
-      }
-      uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str);
-      inputs.push_back(op->getOperands()[index]);
-    }
-
-    for (const std::string &str : std::get<2>(kernel_sign.args)) {
-      if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) {
-        // Todo: print error log
-        return;
-      }
-      uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str);
-      output_types.push_back(op->getResultTypes()[index]);
-      ori_output.push_back(op->getResult(index));
-    }
-
-    auto loc = getFunction().getLoc();
-    builder.setInsertionPoint(op);
-    auto kernel_op = builder.create<infrt::KernelOp>(
-        loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary());
-    for (size_t index = 0; index < ori_output.size(); ++index) {
-      ori_output[index].replaceAllUsesWith(kernel_op.getResult(index));
-    }
-    if (!op->use_empty()) {
-      // Todo: print error log
-      return;
-    }
-    op->erase();
-  }
-}
-void phiOpCvtPass::diapatchStage() {
-  std::vector<infrt::KernelOp> worklist;
-  mlir::Block &block = getFunction().front();
-  for (auto &op : block) {
-    infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null<infrt::KernelOp>(&op);
-    if (nullptr != kernel_op) worklist.push_back(kernel_op);
-  }
-
-  mlir::OpBuilder builder(&block, block.begin());
-  std::map<infrt::TargetType, mlir::Value> phi_context;
-  for (infrt::KernelOp kernel_op : worklist) {
-    std::string kernel_name = kernel_op.name().str();
-    std::vector<infrt::PhiKernelDesc> candidates =
-        getCandidateKernels(kernel_name, valid_places_);
-    if (candidates.empty()) {
-      LOG(FATAL) << "No candidate kernels for op:" << kernel_name;
-      continue;
-    }
-    builder.setInsertionPoint(kernel_op);
-
-    // Todo: Implimentation the concrete pass pick strategy
-    const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front();
-
-    kernel_name =
-        infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) +
-        kernel_name +
-        infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) +
-        infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout);
-
-    mlir::OperationName operation_name(kernel_name, kernel_op.getContext());
-    mlir::OperationState operation_state(kernel_op.getLoc(), operation_name);
-
-    if (phi_context.find(phi_kernel_desc.kernelType.target) ==
-        phi_context.end()) {
-      switch (phi_kernel_desc.kernelType.target) {
-        case infrt::TargetType::CPU: {
-          auto context_value =
-              builder
-                  .create<infrt::phi::CreateCPUContextOp>(
-                      kernel_op.getLoc(),
-                      infrt::phi::ContextType::get(kernel_op.getContext(),
-                                                   infrt::TargetType::CPU))
-                  .output();
-          phi_context[infrt::TargetType::CPU] = context_value;
-        } break;
-        case infrt::TargetType::GPU:
-        case infrt::TargetType::UNK:
-        default:
-          LOG(FATAL) << "Unsupported TargetType";
-          break;
-      }
-    }
-    operation_state.addOperands(
-        phi_context.at(phi_kernel_desc.kernelType.target));
-    for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) {
-      mlir::Value input = kernel_op.getOperand(index);
-      auto cvt_tensor_type_op = builder.create<infrt::CvtTensorOp>(
-          kernel_op.getLoc(),
-          infrt::DenseTensorType::get(
-              kernel_op.getContext(),
-              phi_kernel_desc.inputsType[index].target,
-              phi_kernel_desc.inputsType[index].precision,
-              phi_kernel_desc.inputsType[index].layout),
-          input);
-      operation_state.addOperands(cvt_tensor_type_op.output());
-    }
-    for (size_t index = 0; index < phi_kernel_desc.outputsType.size();
-         ++index) {
-      operation_state.addTypes(infrt::DenseTensorType::get(
-          kernel_op.getContext(),
-          phi_kernel_desc.outputsType[index].target,
-          phi_kernel_desc.outputsType[index].precision,
-          phi_kernel_desc.outputsType[index].layout));
-    }
-    operation_state.addAttributes(kernel_op.attrsAttr().getValue());
-    mlir::Operation *phi_operation = builder.createOperation(operation_state);
-    for (size_t index = 0; index < phi_kernel_desc.outputsType.size();
-         ++index) {
-      mlir::Value input = phi_operation->getResult(index);
-      auto cvt_tensor_type_op = builder.create<infrt::CvtTensorOp>(
-          kernel_op.getLoc(), kernel_op.getResultTypes()[index], input);
-      kernel_op.getResult(index).replaceAllUsesWith(
-          cvt_tensor_type_op.output());
-    }
-    kernel_op.erase();
-  }
-}
-
-}  // namespace
-
-std::unique_ptr<mlir::Pass> infrt::createPhiOpCvtPass(
-    std::vector<Place> valid_places) {
-  return std::make_unique<phiOpCvtPass>(valid_places);
-}
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
index 64b184359700e..1cd5b5a85511f 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
@@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput(
     const std::string& name) const {
   return false;
 }
+bool ProtoArgumentMappingContext::IsDenseTensorVectorInput(
+    const std::string& name) const {
+  return false;
+}
 
 bool ProtoArgumentMappingContext::IsDenseTensorOutput(
     const std::string& name) const {
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
index e4e9b5c3ff8a1..5cf2ef979076d 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <mlir/IR/Operation.h>
 #include <unordered_map>
-#include "paddle/infrt/dialect/pd_ops_info.h"
+#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
 #include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace infrt {
@@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext {
 
   bool IsDenseTensorInput(const std::string& name) const override;
   bool IsSelectedRowsInput(const std::string& name) const override;
+  bool IsDenseTensorVectorInput(const std::string& name) const override;
 
   bool IsDenseTensorOutput(const std::string& name) const override;
   bool IsSelectedRowsOutput(const std::string& name) const override;
diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc
index de61dba8e744c..0beb5bff29f6d 100644
--- a/paddle/infrt/dialect/phi/phi_ir_exec.cc
+++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc
@@ -18,7 +18,7 @@
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
 #include "paddle/infrt/dialect/mlir_loader.h"
-#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
+#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
 
 int main(int argc, char** argv) {
   static llvm::cl::opt<std::string> input_file(
diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
index 46c250b05492c..6467c1285f85e 100644
--- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
+++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
@@ -3,7 +3,7 @@
 
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
-include "paddle/infrt/dialect/pd_ops.td"
+include "paddle/infrt/dialect/pd/ir/pd_ops.td"
 include "paddle/infrt/dialect/tensorrt/trt_ops.td"
 
 def PD2TRT_Matmul_Lower : Pat<
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
index ad6b136463a71..0878163a955af 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
@@ -17,11 +17,12 @@
 #include <llvm/ADT/SetVector.h>
 #include <mlir/Analysis/SliceAnalysis.h>
 #include <mlir/IR/Builders.h>
-#include <paddle/infrt/dialect/pd_ops.h>
 #include <list>
 #include <unordered_set>
 #include <vector>
 
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
+
 namespace infrt {
 namespace trt {
 namespace {
@@ -54,8 +55,8 @@ bool reverseDfs(std::vector<mlir::Operation *> source,
 
 // merge the first&second graph op to a new graph op.
 void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder,  // NOLINT
-                             mlir::pd::GraphOp first,
-                             mlir::pd::GraphOp second) {
+                             infrt::pd::GraphOp first,
+                             infrt::pd::GraphOp second) {
   // comput inputs and outputs
   ::llvm::SmallVector<mlir::Value, 4> inputs(first.getOperands()), outputs;
   for (mlir::Value input : second.getOperands()) {
@@ -84,7 +85,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder,  // NOLINT
   // create the new graph op
   builder.setInsertionPoint(first);
   auto loc = first.getLoc();
-  auto graph_op = builder.create<mlir::pd::GraphOp>(loc, return_types, inputs);
+  auto graph_op = builder.create<infrt::pd::GraphOp>(loc, return_types, inputs);
   mlir::Block *block = new mlir::Block;
   auto copy_range = second.getBody()->without_terminator();
   block->getOperations().splice(block->begin(),
@@ -149,13 +150,13 @@ void TRTGraphFusePass::runOnFunction() {
   do {
     changed = false;
     for (auto &op : body) {
-      mlir::pd::GraphOp graph_op =
-          ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(&op);
+      infrt::pd::GraphOp graph_op =
+          ::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(&op);
       if (nullptr == graph_op) continue;
 
       for (auto user_op : op.getUsers()) {
-        mlir::pd::GraphOp user_graph_op =
-            ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(user_op);
+        infrt::pd::GraphOp user_graph_op =
+            ::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(user_op);
         if (nullptr == user_graph_op) continue;
         // get all dst input nodes except src.
         std::vector<mlir::Operation *> source_nodes;
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
index e3a7b455024c6..ade61bfc370f5 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
@@ -15,24 +15,24 @@
 #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
 
 #include <mlir/IR/Builders.h>
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 
 namespace infrt {
 namespace trt {
 // Implementation of the trtGraphSplitPass。
 void TRTGraphSplitPass::runOnFunction() {
-  std::vector<mlir::pd::GraphOp> worklist;
+  std::vector<infrt::pd::GraphOp> worklist;
   mlir::Block& block = getFunction().front();
   for (auto& op : block) {
-    mlir::pd::GraphOp graph_op =
-        ::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(&op);
+    infrt::pd::GraphOp graph_op =
+        ::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(&op);
     if (nullptr != graph_op &&
         graph_op.getBody()->getOperations().size() <= min_subgraph_size_) {
       worklist.push_back(graph_op);
     }
   }
   while (!worklist.empty()) {
-    mlir::pd::GraphOp graph_op = worklist.back();
+    infrt::pd::GraphOp graph_op = worklist.back();
     worklist.pop_back();
     mlir::Block* body = graph_op.getBody();
     auto return_op = body->getTerminator();
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
index 83bebdb6bf19b..19c6b13e971ec 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
@@ -14,7 +14,7 @@
 #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
 #include <mlir/IR/Builders.h>
 #include <mlir/Transforms/DialectConversion.h>
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
 
 namespace infrt {
@@ -27,7 +27,7 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern {
       : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {}
   ::mlir::LogicalResult matchAndRewrite(
       ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override {
-    auto casted_op = ::llvm::dyn_cast<mlir::pd::GraphOp>(op);
+    auto casted_op = ::llvm::dyn_cast<infrt::pd::GraphOp>(op);
     ::mlir::Operation::operand_range inputs = casted_op.inputs();
     auto ods_loc = rewriter.getFusedLoc(op->getLoc());
     CreateEngineOp create_engine_op;
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
index 9f348b4122fc7..ef9ccc82678f4 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
@@ -17,7 +17,7 @@
 #include <mlir/IR/Builders.h>
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 
 namespace infrt {
 namespace trt {
@@ -35,13 +35,13 @@ void TRTOpTellerPass::runOnFunction() {
     auto *op = worklist.back();
     worklist.pop_back();
     if (op == nullptr) continue;
-    if (::llvm::dyn_cast_or_null<mlir::pd::FeedOp>(op)) continue;
-    if (::llvm::dyn_cast_or_null<mlir::pd::FetchOp>(op)) continue;
-    if (::llvm::dyn_cast_or_null<mlir::pd::GraphOp>(op)) continue;
+    if (::llvm::dyn_cast_or_null<infrt::pd::FeedOp>(op)) continue;
+    if (::llvm::dyn_cast_or_null<infrt::pd::FetchOp>(op)) continue;
+    if (::llvm::dyn_cast_or_null<infrt::pd::GraphOp>(op)) continue;
     if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue;
     builder.setInsertionPoint(op);
     auto loc = getFunction().getLoc();
-    auto graph_op = builder.create<mlir::pd::GraphOp>(
+    auto graph_op = builder.create<infrt::pd::GraphOp>(
         loc, op->getResultTypes(), op->getOperands());
 
     ::llvm::SmallVector<mlir::Value, 4> tblgen_repl_values;
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc
index d5222976625a2..415a78a6967ab 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc
@@ -21,6 +21,10 @@
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h"
 
+#include "paddle/infrt/dialect/dense_tensor.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"
+
 namespace infrt {
 namespace trt {
 
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h
index 78d960b512045..76768037dbdb3 100644
--- a/paddle/infrt/dialect/tensorrt/trt_ops.h
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.h
@@ -30,7 +30,7 @@
 #include <mlir/Interfaces/SideEffectInterfaces.h>
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
 #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 
 namespace infrt {
 namespace trt {
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td
index 132a1d7805bdb..803a11ed5b7e5 100755
--- a/paddle/infrt/dialect/tensorrt/trt_ops.td
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.td
@@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td"
 include "mlir/IR/OpBase.td"
 include "paddle/infrt/dialect/tensorrt/trt_op_base.td"
 
+include "paddle/infrt/dialect/infrt/ir/infrt_base.td"
+include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td"
 
 def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
   let summary = "trt CreateEngine Op";
@@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<
     Describe a tensorrt subgraph.
   }];
   let regions = (region SizedRegion<1>:$body);
-  let arguments = (ins Variadic<TRT_Tensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
-  let results = (outs TRT_EngineType:$output);
+  let arguments = (ins Variadic<DenseTensor>:$inputs, DefaultValuedAttr<BoolAttr, "true">:$run_once);
+  let results = (outs TRT_EngineType:$engine);
 }
 
 def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
@@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> {
   let description = [{
     Describe a tensorrt runtime.
   }];
-  let arguments = (ins TRT_EngineType:$engine, Variadic<TRT_Tensor>:$inputs);
-  let results = (outs Variadic<TRT_Tensor>:$output);
+  let arguments = (ins TRT_EngineType:$engine, Variadic<DenseTensor>:$inputs);
+  let results = (outs Variadic<DenseTensor>:$output);
+}
+
+def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> {
+  let summary = "trt compute engine";
+  let description = [{
+    execute engine
+  }];
+  let arguments = (ins TRT_EngineType:$engine, Context:$context);
+  let results = (outs DenseTensorList:$outputs);
+}
+
+def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> {
+  let summary = "trt inspect engine";
+  let description = [{
+    Show engine
+  }];
+  let arguments = (ins TRT_EngineType:$engine);
 }
 
 def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
@@ -34,11 +53,44 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
     TensorRT IActivationLayer.
     
   }];
-  let arguments = (ins  TRT_Tensor:$input, SI32Attr:$activation_type,
+  let arguments = (ins  DenseTensor:$input, SI32Attr:$activation_type,
                         DefaultValuedAttr<F32Attr, "0.0">:$alpha,
                         DefaultValuedAttr<F32Attr, "0.0">:$beta);
 
-  let results = (outs TRT_Tensor:$output);
+  let results = (outs DenseTensor:$output);
+}
+
+def TRT_FullyConnectedOp : TRT_Op<"FullyConnected", [NoSideEffect]> {
+  let summary = "TensorRT IFullyConnectedLayer";
+  let description = [{
+    TensorRT IFullyConnectedLayer
+  }];
+  let arguments = (ins
+    DenseTensor:$input_tensor,
+    DenseTensor:$kernel_weights,
+    DenseTensor:$bias_weights,
+    SI32Attr:$out_channel_num
+  );
+  let results = (outs
+    DenseTensor:$output_tensor
+  );
+}
+
+def TRT_ConvolutionOp : TRT_Op<"Convolution", [NoSideEffect]> {
+  let summary = "TensorRT IConvolutionLayer";
+  let description = [{
+    TensorRT IConvolutionLayer
+  }];
+  let arguments = (ins
+    DenseTensor:$input_tensor,
+    DenseTensor:$kernel_weights,
+    DenseTensor:$bias_weights,
+    SI32Attr:$out_channel_num,
+    I32ArrayAttr:$kernel_size
+  );
+  let results = (outs
+    DenseTensor:$output_tensor
+  );
 }
 
 def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
@@ -48,9 +100,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
     TensorRT IElementWiseLayer.
     
   }];
-  let arguments = (ins  TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation);
+  let arguments = (ins  DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation);
 
-  let results = (outs TRT_Tensor:$output);
+  let results = (outs DenseTensor:$output);
 }
 
 def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
@@ -60,10 +112,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
     TensorRT IMatrixMultiplyLayer.
     
   }];
-  let arguments = (ins  TRT_Tensor:$input1, BoolAttr:$transpose1,
-                        TRT_Tensor:$input2, BoolAttr:$transpose2);
+  let arguments = (ins  DenseTensor:$input1, BoolAttr:$transpose1,
+                        DenseTensor:$input2, BoolAttr:$transpose2);
 
-  let results = (outs TRT_Tensor:$output);
+  let results = (outs DenseTensor:$output);
 }
 
 #endif  // TRT_OPS
diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc
index 1506282f62681..81bf873ddf0cf 100644
--- a/paddle/infrt/host_context/mlir_exec.cc
+++ b/paddle/infrt/host_context/mlir_exec.cc
@@ -30,10 +30,13 @@
 #include "paddle/infrt/kernel/test_kernels.h"
 #ifdef INFRT_WITH_PHI
 #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
-#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h"
+#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
 #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
 #include "paddle/infrt/kernel/phi/registry.h"
-#endif
+#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
+#include "paddle/infrt/kernel/tensorrt/registry.h"
+#endif  // INFRT_WITH_GPU && INFRT_WITH_TRT
+#endif  // INFRT_WITH_PHI
 
 static llvm::cl::list<std::string> cl_shared_libs(  // NOLINT
     "shared_libs",
@@ -62,6 +65,9 @@ int main(int argc, char** argv) {
 #ifdef INFRT_WITH_PHI
   kernel::RegisterPhiKernels(&registry);
   kernel::RegisterInferShapeLaunchers(&registry);
+#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT)
+  kernel::RegisterTrtKernels(&registry);
+#endif  // INFRT_WITH_GPU && INFRT_WITH_TRT
 #endif
 
   // load extra shared library
diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
index c613843cd1779..7e90f225cffa7 100644
--- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc
+++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc
@@ -16,12 +16,14 @@
 
 #include <llvm/Support/SourceMgr.h>
 #include <mlir/Dialect/StandardOps/IR/Ops.h>
+#include <mlir/IR/BuiltinAttributes.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/BuiltinTypes.h>
 #include <mlir/IR/Diagnostics.h>
 #include <mlir/IR/OperationSupport.h>
 #include <mlir/Parser.h>
 
+#include <glog/logging.h>
 #include <iostream>
 #include <memory>
 #include <string>
@@ -42,6 +44,13 @@
 #include "paddle/infrt/host_context/value.h"
 #include "paddle/infrt/tensor/tensor_shape.h"
 
+#ifdef INFRT_WITH_PHI
+#ifdef INFRT_WITH_TRT
+#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
+#endif
+#include "paddle/phi/core/dense_tensor.h"
+#endif
+
 namespace infrt {
 namespace host_context {
 
@@ -277,51 +286,91 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
       impl_->runtime->NewOpExecutable(op->getName().getStringRef().str());
 
   VLOG(3) << "processing general op : " << op->getName().getStringRef().str();
+  // TODO(wilber): Find a more appropriate way to handle special cases.
+  if (op->getName().getStringRef() == "trt.create_engine") {
+#ifdef INFRT_WITH_TRT
+    auto* symbols = impl_->runtime->symbol_table();
+    ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation;
+    mlir_operation.operation = op;
+    mlir_operation.symbol_table = symbols;
+    impl_->cur_op->AppendArgument(new Value(mlir_operation));
+    // TODO(wilber): how to pass DenseTensor to create_engine op? temporialiy
+    // add a naive implement.
+    for (int i = 0, e = op->getNumOperands(); i < e; ++i) {
+      auto operand = op->getOperand(i);
+      Value* arg_value{nullptr};
+      if (operand.isa<mlir::BlockArgument>()) {
+        mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
+        arg_value = GetValue(arg);
+      } else {
+        arg_value = GetValue(operand);
+        if (!arg_value) {
+          auto upstream_op = operand.getDefiningOp();
+          arg_value = GetOpResult(upstream_op);
+        }
+      }
+      if (arg_value->is_type<phi::DenseTensor>()) {
+        impl_->runtime->FeedInArgs(
+            std::make_pair(std::to_string(i), ValueRef(arg_value)));
+      }
+    }
+#else
+    CHECK(false) << "should not reach here";
+#endif
+  } else {
+    // process operands
+    for (int i = 0, e = op->getNumOperands(); i < e; i++) {
+      // function argument as value
+      auto operand = op->getOperand(i);
+      /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) {
+      if (operand.isa<mlir::BlockArgument>()) {
+        mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
+        Value* arg_value = GetValue(arg);
+        impl_->cur_op->AppendArgument(arg_value);
+        VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " "
+                << GetValue(arg);
+        continue;
+      }
 
-  // process operands
-  for (int i = 0, e = op->getNumOperands(); i < e; i++) {
-    // function argument as value
-    auto operand = op->getOperand(i);
-    /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) {
-    if (operand.isa<mlir::BlockArgument>()) {
-      mlir::BlockArgument arg = operand.dyn_cast<mlir::BlockArgument>();
-      Value* arg_value = GetValue(arg);
+      // normal value
+      Value* arg_value = GetValue(operand);
+      if (!arg_value) {
+        auto upstream_op = operand.getDefiningOp();
+        arg_value = GetOpResult(upstream_op);
+      }
+      CHECK(arg_value) << "No-exist argument value found: "
+                       << DumpToString(operand);
       impl_->cur_op->AppendArgument(arg_value);
-      VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " "
-              << GetValue(arg);
-      continue;
-    }
 
-    // normal value
-    Value* arg_value = GetValue(operand);
-    if (!arg_value) {
-      auto upstream_op = operand.getDefiningOp();
-      arg_value = GetOpResult(upstream_op);
+      VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " "
+              << GetValue(operand) << " vs " << arg_value;
     }
-    CHECK(arg_value) << "No-exist argument value found: "
-                     << DumpToString(operand);
-    impl_->cur_op->AppendArgument(arg_value);
-
-    VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " "
-            << GetValue(operand) << " vs " << arg_value;
   }
 
   // process attributes
   auto attrs = op->getAttrs();
 
   // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its
-  // elements
-  // are sorted by name. The following code adapts the order of function
-  // signatures
-  // of the phi operator library.
+  // elements are sorted by name. The following code adapts the order of
+  // function signatures of the phi operator library.
   llvm::SmallVector<Value*, 4> tmp;
   tmp.resize(attrs.size());
   const std::string& kernel_name = op->getName().getStringRef().str();
   const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name);
-  if (attrs.size() && attr_names.empty()) {
-    LOG(WARNING) << "The kernel `" << kernel_name
-                 << "` has no specified attr order.";
+  if (attrs.size()) {
+    if (attr_names.empty()) {
+      LOG(WARNING) << "The kernel `" << kernel_name
+                   << "` has not been registered with "
+                      "`KernelRegistry::AddKernelWithAttrs()`.";
+    } else {
+      CHECK_EQ(attr_names.size(), attrs.size())
+          << "The number of kernel `" << kernel_name
+          << "` attributes specified by mlir (" << attrs.size()
+          << ") is inconsistent with the registration (" << attr_names.size()
+          << ").";
+    }
   }
+
   auto get_offset = [](const char* attr,
                        const std::vector<const char*>& names,
                        const std::string& kernel_name) -> int {
@@ -344,7 +393,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
     } else {
       offset = i;
     }
-    CHECK_NE(offset, -1);
+    CHECK_GT(offset, -1);
     if (auto v = EmitAttribute<int32_t>(attr.getValue())) {
       tmp[offset] = new Value(*v);
     } else if (auto v = EmitAttribute<int64_t>(attr.getValue())) {
@@ -383,33 +432,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
     impl_->cur_op->AppendAttribute(tmp[i]);
   }
 
-  // process results
-  llvm::SmallVector<Value*, 4> res_values;
-  for (int i = 0, e = op->getNumResults(); i < e; i++) {
-    auto res = op->getResult(i);
-    if (res.getType().isa<::infrt::DenseTensorType>()) {
-      auto r = impl_->value_map.try_emplace(
-          res, ValueRef(new Value{::phi::DenseTensor()}));
-      CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
-                      << "]";
-      res_values.push_back(r.first->second.get());
-    } else {
-      res_values.push_back(AddValue(res));
-    }
-
-    VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
-  }
-  impl_->cur_op->SetResults(res_values);
-
-#ifdef INFRT_DEBUG
-  {
-    VLOG(3) << "check result";
-    for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
-      VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
-    }
-  }
-#endif
-
   // process regions, we treat regions as attribute.
   auto num_regions = op->getNumRegions();
   if (num_regions > 0) {
@@ -438,6 +460,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(
     impl_->cur_op->AppendAttribute(new Value(function));
   }
 
+  // process results
+  llvm::SmallVector<Value*, 4> res_values;
+  for (int i = 0, e = op->getNumResults(); i < e; i++) {
+    auto res = op->getResult(i);
+    if (res.getType().isa<::infrt::DenseTensorType>()) {
+      auto r = impl_->value_map.try_emplace(
+          res, ValueRef(new Value{::phi::DenseTensor()}));
+      CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res)
+                      << "]";
+      res_values.push_back(r.first->second.get());
+    } else {
+      res_values.push_back(AddValue(res));
+    }
+
+    VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res);
+  }
+  impl_->cur_op->SetResults(res_values);
+
+#ifdef INFRT_DEBUG
+  {
+    VLOG(3) << "check result";
+    for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) {
+      VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i];
+    }
+  }
+#endif
+
   return true;
 }
 
diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc
index 18c25827b8ec5..ec12815e3ce94 100644
--- a/paddle/infrt/host_context/paddle_mlir.cc
+++ b/paddle/infrt/host_context/paddle_mlir.cc
@@ -13,15 +13,20 @@
 // limitations under the License.
 
 #include "paddle/infrt/host_context/paddle_mlir.h"
-#include "paddle/infrt/dialect/pd_ops_info.h"
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+#include "paddle/infrt/dialect/pd/common/pd_ops_info.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
 
 MLIRModelGenImpl::MLIRModelGenImpl()
     : context_(infrt::Global::getMLIRContext()), builder_(context_) {
-  context_->allowUnregisteredDialects();
   context_->getOrLoadDialect<mlir::StandardOpsDialect>();
   context_->getOrLoadDialect<infrt::ts::TensorShapeDialect>();
   context_->getOrLoadDialect<infrt::dt::DTDialect>();
-  context_->getOrLoadDialect<mlir::pd::PaddleDialect>();
+  context_->getOrLoadDialect<infrt::pd::PaddleDialect>();
+  context_->getOrLoadDialect<::infrt::InfrtDialect>();
+  context_->getOrLoadDialect<::infrt::phi::PHIDialect>();
+  context_->getOrLoadDialect<::infrt::phi::PHIDenseTensorDialect>();
   module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_));
 }
 
@@ -55,7 +60,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel(
   UpdateModelParams(program, &mainFunc);
   UpdateModelOps(program);
   UpdateModelOutputs(program);
-
   return module_;
 }
 
@@ -78,7 +82,7 @@ mlir::FuncOp MLIRModelGenImpl::UpdateModelModule(
 llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelInputsType(
     const infrt::paddle::framework_proto::ProgramDesc &program) {
   llvm::SmallVector<mlir::Type, 4> operandTypes;
-  operandTypes.push_back(infrt::DenseTensorMapType::get(context_));
+  operandTypes.push_back(infrt::phi::DenseTensorMapType::get(context_));
   for (auto &op_desc : main_block_.ops()) {
     if (op_desc.type() != "feed") continue;
     for (int var_idx = 0; var_idx < op_desc.outputs_size(); ++var_idx) {
@@ -90,11 +94,15 @@ llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelInputsType(
         if (var_desc.name() == input_var_name) {
           std::vector<int64_t> dims = RepeatedToVector<int64_t>(
               var_desc.type().lod_tensor().tensor().dims());
-          mlir::Type precision_;
-          ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(),
-                          builder_,
-                          &precision_);
-          mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_);
+          infrt::PrecisionType precision_;
+          ConvertDataTypeToPhi(
+              var_desc.type().lod_tensor().tensor().data_type(), &precision_);
+          mlir::Type type_ =
+              infrt::DenseTensorType::get(context_,
+                                          infrt::TargetType::CPU,
+                                          precision_,
+                                          infrt::LayoutType::ANY);
+
           operandTypes.push_back(type_);
         }
       }
@@ -116,11 +124,14 @@ llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetModelOutputsType(
         if (var_desc.name() == input_var_name) {
           std::vector<int64_t> dims = RepeatedToVector<int64_t>(
               var_desc.type().lod_tensor().tensor().dims());
-          mlir::Type precision_;
-          ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(),
-                          builder_,
-                          &precision_);
-          mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_);
+          infrt::PrecisionType precision_;
+          ConvertDataTypeToPhi(
+              var_desc.type().lod_tensor().tensor().data_type(), &precision_);
+          mlir::Type type_ =
+              infrt::DenseTensorType::get(context_,
+                                          infrt::TargetType::CPU,
+                                          precision_,
+                                          infrt::LayoutType::ANY);
           resultTypes.push_back(type_);
         }
       }
@@ -167,12 +178,12 @@ void MLIRModelGenImpl::UpdateModelParams(
       auto name = builder_.getStringAttr(var_desc.name());
       std::vector<int64_t> dims = RepeatedToVector<int64_t>(
           var_desc.type().lod_tensor().tensor().dims());
-      mlir::Type precision_;
-      ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(),
-                      builder_,
-                      &precision_);
-      mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_);
-      auto op = builder_.create<infrt::dt::TensorMapGetTensorOp>(
+      infrt::PrecisionType precision_;
+      ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(),
+                           &precision_);
+      mlir::Type type_ = infrt::DenseTensorType::get(
+          context_, infrt::TargetType::CPU, precision_, infrt::LayoutType::ANY);
+      auto op = builder_.create<::infrt::phi::TensorMapGetTensorOp>(
           mlir::UnknownLoc::get(context_), type_, map, name);
       params_map_.insert(std::pair<std::string, mlir::Value>(
           var_desc.name(), op.getOperation()->getResult(0)));
@@ -197,8 +208,9 @@ void MLIRModelGenImpl::UpdateModelOutputs(
 
         llvm::SmallVector<mlir::Type, 4> resultTypes;
         llvm::SmallVector<mlir::NamedAttribute, 4> attrs;
+
         mlir::OperationState state(loc,
-                                   mlir::ReturnOp::getOperationName(),
+                                   ::infrt::ReturnOp::getOperationName(),
                                    operands,
                                    resultTypes,
                                    attrs);
@@ -256,11 +268,13 @@ llvm::SmallVector<mlir::Type, 4> MLIRModelGenImpl::GetOpOutputType(
       if (var_desc.name() == var_name) {
         std::vector<int64_t> dims = RepeatedToVector<int64_t>(
             var_desc.type().lod_tensor().tensor().dims());
-        mlir::Type precision_;
-        ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(),
-                        builder_,
-                        &precision_);
-        mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_);
+        infrt::PrecisionType precision_;
+        ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(),
+                             &precision_);
+        mlir::Type type_ = infrt::DenseTensorType::get(context_,
+                                                       infrt::TargetType::CPU,
+                                                       precision_,
+                                                       infrt::LayoutType::ANY);
         resultTypes.push_back(type_);
       }
     }
@@ -321,7 +335,7 @@ llvm::SmallVector<mlir::NamedAttribute, 4> MLIRModelGenImpl::GetOpAttributes(
     switch (type) {
       ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr);
       ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr);
-      ATTR_IMPL_CASE(INT, i, getI32IntegerAttr);
+      ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr);
       ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr);
       ATTR_IMPL_CASE(STRING, s, getStringAttr);
 
@@ -397,3 +411,38 @@ bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype,
       return false;
   }
 }
+
+bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype,
+                          infrt::PrecisionType *type) {
+  switch (dtype) {
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP16:
+      *type = infrt::PrecisionType::FLOAT16;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP32:
+      *type = infrt::PrecisionType::FLOAT32;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP64:
+      *type = infrt::PrecisionType::FLOAT64;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_BOOL:
+      *type = infrt::PrecisionType::BOOL;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT8:
+      *type = infrt::PrecisionType::INT8;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT16:
+      *type = infrt::PrecisionType::INT16;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT32:
+      *type = infrt::PrecisionType::INT32;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT64:
+      *type = infrt::PrecisionType::INT64;
+      return true;
+    case infrt::paddle::framework_proto::VarType::Type::VarType_Type_UINT8:
+      *type = infrt::PrecisionType::UINT8;
+      return true;
+    default:
+      return false;
+  }
+}
diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h
index e825cbb5a11ea..a351b5cf80e23 100644
--- a/paddle/infrt/host_context/paddle_mlir.h
+++ b/paddle/infrt/host_context/paddle_mlir.h
@@ -14,22 +14,22 @@
 #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
 #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
 
+#include <llvm/Support/CommandLine.h>
+#include <mlir/Dialect/StandardOps/IR/Ops.h>
+#include <mlir/IR/AsmState.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/MLIRContext.h>
 #include <fstream>
 #include <iostream>
 #include <string>
 
-#include "llvm/Support/CommandLine.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/AsmState.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/MLIRContext.h"
 #include "paddle/infrt/common/global.h"
 #include "paddle/infrt/common/string.h"
 #include "paddle/infrt/dialect/dense_tensor.h"
 #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
-
 #include "paddle/infrt/dialect/init_dialects.h"
-#include "paddle/infrt/dialect/pd_ops.h"
+#include "paddle/infrt/dialect/pd/ir/pd_ops.h"
 #include "paddle/infrt/dialect/tensor_shape.h"
 #include "paddle/infrt/paddle/model_parser.h"
 
@@ -102,4 +102,7 @@ inline std::vector<T> RepeatedToVector(
 bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype,
                      mlir::Builder builder,
                      mlir::Type *type);
+bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype,
+                          infrt::PrecisionType *type);
+
 #endif  // PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_
diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h
index 957d852442b10..5b92d183b79da 100644
--- a/paddle/infrt/host_context/value.h
+++ b/paddle/infrt/host_context/value.h
@@ -24,6 +24,7 @@
 #include "paddle/infrt/common/shared.h"
 #include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/host_context/function.h"
+#include "paddle/infrt/host_context/symbol_table.h"
 #include "paddle/infrt/support/variant.h"
 #include "paddle/infrt/tensor/dense_host_tensor.h"
 #include "paddle/infrt/tensor/dense_tensor_view.h"
@@ -33,6 +34,7 @@
 #ifdef INFRT_WITH_PHI
 #include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/infrt/backends/host/phi_context.h"
+#include "paddle/infrt/tensor/phi/tensor_map.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
@@ -41,7 +43,15 @@
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/meta_tensor.h"
-#endif
+
+#ifdef INFRT_WITH_GPU
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#endif  // INFRT_WITH_GPU
+#ifdef INFRT_WITH_TRT
+#include "paddle/infrt/backends/tensorrt/trt_engine.h"
+#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
+#endif  // INFRT_WITH_TRT
+#endif  // INFRT_WITH_PHI
 
 namespace infrt {
 namespace host_context {
@@ -72,16 +82,26 @@ using ValueVariantType =
             ::phi::MetaTensor,
             ::phi::DenseTensor,
             backends::CpuPhiContext,
+#ifdef INFRT_WITH_GPU
+            backends::GpuPhiContext,
+            ::phi::GPUContext,
+#endif  // INFRT_WITH_GPU
             ::phi::CPUContext,
-            std::vector<const phi::DenseTensor*>,
-            paddle::experimental::ScalarBase<phi::DenseTensor>,
-            paddle::experimental::ScalarArrayBase<phi::DenseTensor>,
-            std::vector<phi::MetaTensor*>,
-            phi::MetaConfig,
+            std::vector<const ::phi::DenseTensor*>,
+            std::vector<::phi::DenseTensor*>,
+            paddle::experimental::ScalarBase<::phi::DenseTensor>,
+            paddle::experimental::ScalarArrayBase<::phi::DenseTensor>,
+            std::vector<::phi::MetaTensor*>,
+            ::phi::MetaConfig,
             paddle::experimental::Backend,
             paddle::experimental::DataLayout,
             paddle::experimental::DataType,
-#endif
+            ::infrt::phi::DenseTensorMap,
+#endif  // INFRT_WITH_PHI
+#ifdef INFRT_WITH_TRT
+            ::infrt::backends::tensorrt::TrtEngine,
+            ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol,
+#endif  // INFRT_WITH_TRT
             std::vector<int16_t>,
             std::vector<int32_t>,
             std::vector<int64_t>,
@@ -118,10 +138,21 @@ class Value : public common::Object {
   explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {}
   explicit Value(MlirFunctionExecutable* x) : data(x) {}
 #ifdef INFRT_WITH_PHI
+  explicit Value(::infrt::phi::DenseTensorMap&& x) : data(std::move(x)) {}
   explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {}
   explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {}
+#ifdef INFRT_WITH_GPU
+  explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {}
+  explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {}
+#endif
   explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {}
   explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {}
+#ifdef INFRT_WITH_TRT
+  explicit Value(::infrt::backends::tensorrt::TrtEngine&& x)
+      : data(std::move(x)) {}
+  explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x)
+      : data(x) {}
+#endif  // INFRT_WITH_TRT
 #endif
 
   template <typename T>
diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt
index f1cbfba1c46b3..f20344f6f6b84 100644
--- a/paddle/infrt/kernel/CMakeLists.txt
+++ b/paddle/infrt/kernel/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(phi)
+add_subdirectory(tensorrt)
 
 core_gather_headers()
 
diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc
index 39ef172fadef9..b27eacf9e522d 100644
--- a/paddle/infrt/kernel/phi/context_kernels.cc
+++ b/paddle/infrt/kernel/phi/context_kernels.cc
@@ -25,6 +25,16 @@ ::phi::CPUContext CreateCPUContext() {
   return ctx;
 }
 
+#ifdef INFRT_WITH_GPU
+::phi::GPUContext CreateGPUContext() {
+  ::phi::GPUContext context;
+  context.PartialInitWithoutAllocator();
+  context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{});
+  context.PartialInitWithAllocator();
+  return context;
+}
+#endif
+
 }  // namespace phi
 }  // namespace kernel
 }  // namespace infrt
diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h
index 3e9580b91da57..ae3f76c8fe536 100644
--- a/paddle/infrt/kernel/phi/context_kernels.h
+++ b/paddle/infrt/kernel/phi/context_kernels.h
@@ -25,6 +25,10 @@ namespace phi {
 
 ::phi::CPUContext CreateCPUContext();
 
+#ifdef INFRT_WITH_GPU
+::phi::GPUContext CreateGPUContext();
+#endif
+
 }  // namespace phi
 }  // namespace kernel
 }  // namespace infrt
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
index 777fb29ac60d9..c8b1bd8c9ebd2 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc
@@ -13,8 +13,29 @@
 // limitations under the License.
 
 #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h"
+#include "paddle/infrt/common/string.h"
 #include "paddle/infrt/dialect/phi/data_type.h"
 #include "paddle/infrt/kernel/phi/context_kernels.h"
+#include "paddle/infrt/paddle/model_parser.h"
+#include "paddle/infrt/paddle/scope.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/common/place.h"
+
+#ifdef INFRT_WITH_GPU
+#include <cuda_runtime.h>
+#endif
+
+namespace paddle {
+namespace platform {
+using DeviceContext = ::phi::DeviceContext;
+}  // namespace platform
+namespace framework {
+using LoDTensor = ::phi::DenseTensor;
+void DeserializeFromStream(std::istream& is,
+                           LoDTensor* tensor,
+                           const platform::DeviceContext& dev_ctx);
+}
+}  // namespace paddle
 
 namespace infrt {
 namespace kernel {
@@ -34,26 +55,83 @@ ::phi::DenseTensor CreateDenseTensor(
                              {}));
 }
 
+::phi::DenseTensor CreateGPUDenseTensor(
+    const ::phi::GPUContext& context,
+    host_context::Attribute<std::vector<int64_t>> dims,
+    host_context::Attribute<std::vector<int64_t>> lod,
+    host_context::Attribute<::infrt::LayoutType> layout,
+    host_context::Attribute<::infrt::PrecisionType> precision) {
+  return ::phi::DenseTensor(
+      const_cast<::phi::Allocator*>(&context.GetAllocator()),
+      ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()),
+                             ::phi::make_ddim(dims.get()),
+                             ConvertLayoutToPhi(layout.get()),
+                             {}));
+}
+
 void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
                         host_context::Attribute<std::vector<float>> value) {
-  auto place = ::phi::CPUPlace();
+  auto place = dense_tensor->place();
   float* a_data = dense_tensor->mutable_data<float>(place);
-  for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
-    a_data[i] = (value.get())[i];
+  if (place.GetType() == ::phi::AllocationType::CPU) {
+    for (int64_t i = 0; i < dense_tensor->numel(); ++i) {
+      a_data[i] = (value.get())[i];
+    }
+  } else if (place.GetType() == ::phi::AllocationType::GPU) {
+#ifdef INFRT_WITH_GPU
+    // TODO(wilber): how to set the stream parameter to copy with stream.
+    cudaMemcpy(a_data,
+               value.get().data(),
+               sizeof(float) * value.get().size(),
+               cudaMemcpyHostToDevice);
+#endif
+  } else {
+    llvm_unreachable("temporarily not support other target.");
   }
 }
 
 void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
-#define PRINT_META_DATA(PHI_DATATYPE, DTYPE)              \
-  case ::phi::DataType::PHI_DATATYPE: {                   \
-    DTYPE* data = dense_tensor->data<DTYPE>();            \
-    if (dense_tensor->numel() == 0) break;                \
-    std::cout << data[0];                                 \
-    for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
-      std::cout << "," << data[i];                        \
-    }                                                     \
-    break;                                                \
+#ifndef INFRT_WITH_GPU
+#define PRINT_META_DATA(PHI_DATATYPE, DTYPE)                \
+  case ::phi::DataType::PHI_DATATYPE: {                     \
+    auto place = dense_tensor->place();                     \
+    if (place.GetType() == ::phi::AllocationType::CPU) {    \
+      DTYPE* data = dense_tensor->data<DTYPE>();            \
+      if (dense_tensor->numel() == 0) break;                \
+      std::cout << data[0];                                 \
+      for (int64_t i = 1; i < dense_tensor->numel(); i++) { \
+        std::cout << "," << data[i];                        \
+      }                                                     \
+    }                                                       \
+    break;                                                  \
+  }
+#else
+#define PRINT_META_DATA(PHI_DATATYPE, DTYPE)                     \
+  case ::phi::DataType::PHI_DATATYPE: {                          \
+    auto place = dense_tensor->place();                          \
+    DTYPE* data = dense_tensor->data<DTYPE>();                   \
+    if (dense_tensor->numel() == 0) break;                       \
+    if (place.GetType() == ::phi::AllocationType::CPU) {         \
+      std::cout << data[0];                                      \
+      for (int64_t i = 1; i < dense_tensor->numel(); i++) {      \
+        std::cout << "," << data[i];                             \
+      }                                                          \
+    } else if (place.GetType() == ::phi::AllocationType::GPU) {  \
+      std::vector<DTYPE> host_data(dense_tensor->numel(), 0);    \
+      cudaMemcpy(host_data.data(),                               \
+                 data,                                           \
+                 sizeof(DTYPE) * dense_tensor->numel(),          \
+                 cudaMemcpyDeviceToHost);                        \
+      std::cout << host_data[0];                                 \
+      for (int64_t i = 1; i < dense_tensor->numel(); i++) {      \
+        std::cout << "," << host_data[i];                        \
+      }                                                          \
+    } else {                                                     \
+      llvm_unreachable("temporarily not support other target."); \
+    }                                                            \
+    break;                                                       \
   }
+#endif
 
   ::phi::DDim dims = dense_tensor->dims();
   std::cout << "dense_tensor: shape=shape" << dims.to_str() << ","
@@ -67,6 +145,89 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) {
   std::cout << "]\n";
 #undef PRINT_META_DATA
 }
+
+::infrt::phi::DenseTensorMap LoadParams(
+    host_context::Attribute<std::string> path) {
+  const auto& file_path = path.get();
+  std::cout << "loading params from: " << file_path << std::endl;
+  ::infrt::phi::DenseTensorMap map;
+
+  const std::string model_path = file_path + "/__model__";
+  auto pb_proto_prog = paddle::LoadProgram(model_path);
+  auto main_block = pb_proto_prog->blocks(0);
+
+  for (auto& var : main_block.vars()) {
+    if (var.name() == "feed" || var.name() == "fetch" || !var.persistable())
+      continue;
+    std::string param_path = file_path + "/" + var.name();
+    std::ifstream param_file(param_path, std::ios::binary);
+    switch (var.type().type()) {
+      case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: {
+        std::unique_ptr<::phi::DenseTensor> tensor{
+            std::make_unique<::phi::DenseTensor>()};
+        ::phi::CPUContext ctx;
+        ::paddle::framework::DeserializeFromStream(
+            param_file, tensor.get(), ctx);
+        map.SetDenseTensor(var.name(), std::move(tensor));
+      } break;
+      default: {
+        LOG(WARNING) << "Var `" << var.name() << "` type `"
+                     << static_cast<int>(var.type().type())
+                     << "` has not been supported now.";
+      }
+    }
+  }
+  return map;
+}
+
+::infrt::phi::DenseTensorMap LoadCombinedParams(
+    host_context::Attribute<std::string> model_path,
+    host_context::Attribute<std::string> params_path) {
+  const auto& model = model_path.get();
+  std::cout << "loading params from: " << model << std::endl;
+  ::infrt::phi::DenseTensorMap map;
+
+  auto pb_proto_prog = paddle::LoadProgram(model);
+  auto main_block = pb_proto_prog->blocks(0);
+
+  std::ifstream param_file(params_path.get(), std::ios::binary);
+
+  std::set<std::string> tmp;
+  for (auto& var : main_block.vars()) {
+    if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) {
+      continue;
+    }
+    if (var.type().type() ==
+        ::paddle::framework::proto::VarType_Type_LOD_TENSOR) {
+      tmp.emplace(var.name());
+    } else {
+      llvm_unreachable("the tensor type is illegal.");
+    }
+  }
+
+  for (auto& var : tmp) {
+    std::unique_ptr<::phi::DenseTensor> tensor{
+        std::make_unique<::phi::DenseTensor>()};
+    ::phi::CPUContext ctx;
+    ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx);
+    map.SetDenseTensor(var, std::move(tensor));
+  }
+
+  return map;
+}
+
+::phi::DenseTensor TensorMapGetTensor(
+    const ::infrt::phi::DenseTensorMap& map,
+    host_context::Attribute<std::string> name) {
+  auto* tensor = map.GetDenseTensor(name.get());
+  CHECK(tensor);
+  return *tensor;
+}
+
+int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map) {
+  return map.size();
+}
+
 }  // namespace phi
 }  // namespace kernel
 }  // namespace infrt
diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
index 8cc0e39e0e443..6cfcc6f91be05 100644
--- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h
+++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h
@@ -17,6 +17,7 @@
 #include "paddle/infrt/backends/host/phi_allocator.h"
 #include "paddle/infrt/dialect/infrt/common/types.h"
 #include "paddle/infrt/host_context/kernel_utils.h"
+#include "paddle/infrt/tensor/phi/tensor_map.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace infrt {
@@ -30,10 +31,30 @@ ::phi::DenseTensor CreateDenseTensor(
     host_context::Attribute<::infrt::LayoutType> layout,
     host_context::Attribute<::infrt::PrecisionType> precision);
 
+::phi::DenseTensor CreateGPUDenseTensor(
+    const ::phi::GPUContext& context,
+    host_context::Attribute<std::vector<int64_t>> dims,
+    host_context::Attribute<std::vector<int64_t>> lod,
+    host_context::Attribute<::infrt::LayoutType> layout,
+    host_context::Attribute<::infrt::PrecisionType> precision);
+
 void FillDenseTensorF32(::phi::DenseTensor* dense_tensor,
                         host_context::Attribute<std::vector<float>> values);
 void PrintDenseTensor(::phi::DenseTensor* dense_tensor);
 
+infrt::phi::DenseTensorMap LoadParams(
+    host_context::Attribute<std::string> path);
+
+::phi::DenseTensor TensorMapGetTensor(
+    const ::infrt::phi::DenseTensorMap& map,
+    host_context::Attribute<std::string> name);
+
+::infrt::phi::DenseTensorMap LoadCombinedParams(
+    host_context::Attribute<std::string> model_path,
+    host_context::Attribute<std::string> params_path);
+
+int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map);
+
 }  // namespace phi
 }  // namespace kernel
 }  // namespace infrt
diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
index 08c2e19deddfe..5a314817c2420 100644
--- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
+++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc
@@ -37,15 +37,16 @@ TEST(utils, registry) {
   CHECK_EQ(count, 2U);
 }
 
-class FancyAllocator : public phi::Allocator {
+class FancyAllocator : public ::phi::Allocator {
  public:
-  static void Delete(phi::Allocation* allocation) {
+  static void Delete(::phi::Allocation* allocation) {
     ::operator delete(allocation->ptr());
   }
 
   AllocationPtr Allocate(size_t bytes_size) override {
     void* data = ::operator new(bytes_size);
-    auto* allocation = new phi::Allocation(data, bytes_size, phi::CPUPlace());
+    auto* allocation =
+        new ::phi::Allocation(data, bytes_size, ::phi::CPUPlace());
     return AllocationPtr(allocation, Delete);
   }
 };
@@ -56,20 +57,20 @@ TEST(ElementwiseAdd, launcher_registry) {
   ASSERT_GE(registry.size(), 1UL);
   auto creator = registry.GetKernel("phi_cpu.add.float32.any");
 
-  const phi::DDim dims({1, 2});
-  const phi::DataType dtype{phi::DataType::FLOAT32};
-  const phi::DataLayout layout{phi::DataLayout::NHWC};
-  const phi::LoD lod{};
-  phi::DenseTensorMeta meta(dtype, dims, layout, lod);
+  const ::phi::DDim dims({1, 2});
+  const ::phi::DataType dtype{::phi::DataType::FLOAT32};
+  const ::phi::DataLayout layout{::phi::DataLayout::NHWC};
+  const ::phi::LoD lod{};
+  ::phi::DenseTensorMeta meta(dtype, dims, layout, lod);
 
-  auto fancy_allocator = std::unique_ptr<phi::Allocator>(new FancyAllocator);
+  auto fancy_allocator = std::unique_ptr<::phi::Allocator>(new FancyAllocator);
   auto* alloc = fancy_allocator.get();
 
-  phi::DenseTensor a(alloc, meta);
-  phi::DenseTensor b(alloc, meta);
-  phi::DenseTensor c(alloc, meta);
+  ::phi::DenseTensor a(alloc, meta);
+  ::phi::DenseTensor b(alloc, meta);
+  ::phi::DenseTensor c(alloc, meta);
 
-  auto place = phi::CPUPlace();
+  auto place = ::phi::CPUPlace();
   float* a_data = a.mutable_data<float>(place);
   float* b_data = b.mutable_data<float>(place);
   float* c_data = c.mutable_data<float>(place);
@@ -78,7 +79,7 @@ TEST(ElementwiseAdd, launcher_registry) {
     b_data[i] = 2.f;
   }
 
-  phi::CPUContext context;
+  ::phi::CPUContext context;
   context.SetAllocator(alloc);
   context.Init();
 
diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc
index 0e071418603f8..08683d7cb66ad 100644
--- a/paddle/infrt/kernel/phi/registry.cc
+++ b/paddle/infrt/kernel/phi/registry.cc
@@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
   registry->AddKernel("phi_dt.create_context.cpu",
                       INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext));
   registry->AddKernelWithAttrs(
-      "phi_dt.create_dense_tensor",
+      "phi_dt.create_dense_tensor.cpu",
       INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor),
       {"dims", "lod", "layout", "precision"});
   registry->AddKernelWithAttrs(
@@ -44,6 +44,28 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) {
       {"value"});
   registry->AddKernel("phi_dt.print_tensor",
                       INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor));
+
+#ifdef INFRT_WITH_GPU
+  registry->AddKernel("phi_dt.create_context.gpu",
+                      INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext));
+  registry->AddKernelWithAttrs(
+      "phi_dt.create_dense_tensor.gpu",
+      INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor),
+      {"dims", "lod", "layout", "precision"});
+#endif
+  registry->AddKernelWithAttrs("phi_dt.load_params",
+                               INFRT_KERNEL(infrt::kernel::phi::LoadParams),
+                               {"path"});
+  registry->AddKernelWithAttrs(
+      "phi_dt.load_combined_params",
+      INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParams),
+      {"model_path", "params_path"});
+  registry->AddKernelWithAttrs(
+      "phi_dt.tensor_map_get_tensor",
+      INFRT_KERNEL(infrt::kernel::phi::TensorMapGetTensor),
+      {"name"});
+  registry->AddKernel("phi_dt.tensor_map_get_size",
+                      INFRT_KERNEL(infrt::kernel::phi::TensorMapGetSize));
 }
 
 }  // namespace kernel
diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc
index b7503aa4ef358..407ae16c19c49 100644
--- a/paddle/infrt/kernel/tensor_kernels.cc
+++ b/paddle/infrt/kernel/tensor_kernels.cc
@@ -25,6 +25,10 @@
 #include "paddle/infrt/tensor/tensor_map.h"
 #include "paddle/infrt/tensor/tensor_shape.h"
 
+#ifdef INFRT_WITH_PHI
+#include "paddle/phi/core/dense_tensor.h"
+#endif
+
 namespace infrt {
 namespace kernel {
 using namespace host_context;  // NOLINT
@@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute<std::string> name) {
 
 int32_t TensorMapGetSize(TensorMap map) { return map.size(); }
 
+// TODO(wilber): Maybe we should place TensorList type in dt dialect.
+#ifdef INFRT_WITH_PHI
+::phi::DenseTensor TensorListGetTensor(std::vector<::phi::DenseTensor *> list,
+                                       Attribute<int32_t> idx) {
+  CHECK_LT(idx.get(), static_cast<int>(list.size()))
+      << "idx should less than list size";
+  return *list[idx.get()];
+}
+
+int32_t TensorListGetSize(const std::vector<::phi::DenseTensor *> &list) {
+  return list.size();
+}
+#endif
+
 DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; }
 
 template <typename T>
@@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) {
                       INFRT_KERNEL(TensorMapGetTensor));
   registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize));
 
+// TensorList related methods.
+#ifdef INFRT_WITH_PHI
+  registry->AddKernelWithAttrs(
+      "dt.tensor_list_get_tensor", INFRT_KERNEL(TensorListGetTensor), {"id"});
+  registry->AddKernel("dt.tensor_list_get_size",
+                      INFRT_KERNEL(TensorListGetSize));
+#endif
+
   registry->AddKernel("dt.shallow_copy_tensor",
                       INFRT_KERNEL(ShallowCopyTensor));
 
diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000000..cd35fccbe2aa3
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt
@@ -0,0 +1,10 @@
+if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT))
+  return()
+endif()
+
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    registry.cc
+    trt_kernels.cc
+)
diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc
new file mode 100644
index 0000000000000..a37e3c0f7f278
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/registry.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/kernel/tensorrt/registry.h"
+
+#include "paddle/infrt/host_context/kernel_registry.h"
+#include "paddle/infrt/host_context/kernel_utils.h"
+#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
+
+namespace infrt {
+namespace kernel {
+
+void RegisterTrtKernels(host_context::KernelRegistry* registry) {
+  registry->AddKernel("trt.create_engine",
+                      INFRT_KERNEL(tensorrt::CreateTrtEngine));
+  registry->AddKernel("trt.inspect_engine",
+                      INFRT_KERNEL(tensorrt::PrintTrtLayer));
+  registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute));
+}
+
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h
new file mode 100644
index 0000000000000..762329ca61d02
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/registry.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+namespace infrt {
+namespace host_context {
+
+struct KernelRegistry;
+
+}  // namespace host_context
+}  // namespace infrt
+
+namespace infrt {
+namespace kernel {
+
+/**
+ * Register all the trt kernels to registry.
+ */
+void RegisterTrtKernels(host_context::KernelRegistry* registry);
+
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h
new file mode 100644
index 0000000000000..96122bffacdb2
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/trt_helper.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <NvInfer.h>
+#include <NvInferRuntime.h>
+#include <NvInferRuntimeCommon.h>
+
+#include "glog/logging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace infrt {
+namespace kernel {
+namespace tensorrt {
+
+static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) {
+  switch (tensor_type) {
+    case phi::DataType::FLOAT32:
+      return nvinfer1::DataType::kFLOAT;
+    case phi::DataType::INT32:
+      return nvinfer1::DataType::kINT32;
+    case phi::DataType::FLOAT16:
+      return nvinfer1::DataType::kHALF;
+    default:
+      llvm_unreachable("should not reach here");
+  }
+}
+
+static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) {
+  nvinfer1::Dims dims;
+  dims.nbDims = int_array_attr.size();
+  CHECK(!int_array_attr.empty());
+  CHECK(int_array_attr[0].getType().isIntOrIndex());
+  for (int i = 0; i < dims.nbDims; ++i) {
+    dims.d[i] = int_array_attr[i].cast<mlir::IntegerAttr>().getInt();
+  }
+  return dims;
+}
+
+static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) {
+  CHECK_NOTNULL(tensor);
+  nvinfer1::Weights ret;
+  ret.type = TensorTypeToWeightType(tensor->dtype());
+  ret.count = tensor->numel();
+  ret.values = tensor->data();
+  return ret;
+}
+
+}  // namespace tensorrt
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
new file mode 100644
index 0000000000000..aa7609092b82c
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/kernel/tensorrt/trt_kernels.h"
+#include <string>
+#include "NvInfer.h"
+#include "NvInferRuntime.h"
+#include "NvInferRuntimeCommon.h"
+#include "glog/logging.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+
+#include "paddle/infrt/kernel/tensorrt/trt_helper.h"
+#include "paddle/infrt/kernel/tensorrt/trt_layers.h"
+
+#include "paddle/infrt/backends/tensorrt/trt_engine.h"
+#include "paddle/infrt/backends/tensorrt/trt_options.h"
+#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
+#include "paddle/infrt/host_context/symbol_table.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace infrt {
+namespace kernel {
+namespace tensorrt {
+
+::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
+    MlirOperationWithInfrtSymbol create_engine_op) {
+  // TODO(wilber): The device_id needs to get from mlir.
+  int device_id = 0;
+  backends::tensorrt::TrtEngine engine(device_id);
+
+  auto* builder = engine.GetTrtBuilder();
+  // TODO(wilber): How to process weights?
+  backends::tensorrt::TrtUniquePtr<nvinfer1::INetworkDefinition> network;
+  // TODO(wilber): static_shape or dynamic_shape network? The code is just
+  // static_shape test.
+  network.reset(builder->createNetworkV2(0));
+
+  // TODO(wilber): The build option shoule be fiiled from mlir info.
+  backends::tensorrt::BuildOptions options;
+  options.max_batch = 4;
+  options.workspace = 1024;
+
+  // Parse mlir Region which only has one block.
+  mlir::Operation& operation = *create_engine_op.operation;
+  auto* symbol_table = create_engine_op.symbol_table;
+  CHECK_NOTNULL(symbol_table);
+
+  unsigned int num_regions = operation.getNumRegions();
+  CHECK_EQ(num_regions, 1U) << "only support one region case.";
+  auto& region = operation.getRegion(0);
+  auto& block = region.getBlocks().front();
+
+  std::unordered_map<std::string, phi::DenseTensor*> trt_bind_inputs;
+  ValueToITensorMap value_to_trt_tensor_map;
+  ValueToTensorMap value_to_tensor_map;
+
+  for (auto index_operand : llvm::enumerate(operation.getOperands())) {
+    mlir::Value operand = index_operand.value();
+    size_t idx = index_operand.index();
+
+    const std::string input_name = "input_" + std::to_string(idx);
+    auto* v = symbol_table->GetValue(std::to_string(idx));
+    CHECK_NOTNULL(v);
+    auto* t = &v->get<phi::DenseTensor>();
+    value_to_tensor_map[operand] = t;
+
+    // TODO(wilber): get input info from mlir.
+
+    // TODO(wilber): input dims, now only support static_shape, and just remove
+    // the first dimension. If the first dim is not -1, maybe we can pass the
+    // origin dims.
+
+    // TODO(wilber): now only suppot float input.
+
+    if (operand.isa<mlir::BlockArgument>()) {
+      // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU
+      // tensor, so we treat all GPU tensors as inputs to trt.
+      if (t->place().GetType() == phi::AllocationType::GPU) {
+        trt_bind_inputs[input_name] = t;
+        nvinfer1::Dims dims;
+        dims.nbDims = t->dims().size() - 1;
+        for (int i = 0; i < dims.nbDims; ++i) {
+          dims.d[i] = t->dims()[i + 1];
+        }
+        auto* in = network->addInput(
+            input_name.c_str(), nvinfer1::DataType::kFLOAT, dims);
+        value_to_trt_tensor_map[operand] = in;
+      }
+    } else {
+      // TODO(wilber): Replace with the op name that generates the weights.
+      if (operand.getDefiningOp()->getName().getStringRef() !=
+          "phi_dt.create_dense_tensor.cpu") {
+        trt_bind_inputs[input_name] = t;
+        nvinfer1::Dims dims;
+        dims.nbDims = t->dims().size() - 1;
+        for (int i = 0; i < dims.nbDims; ++i) {
+          dims.d[i] = t->dims()[i + 1];
+        }
+        auto* in = network->addInput(
+            input_name.c_str(), nvinfer1::DataType::kFLOAT, dims);
+        value_to_trt_tensor_map[operand] = in;
+      }
+    }
+  }
+
+  // TODO(wilber): Find a way to add layer.
+  for (auto& operation : block.without_terminator()) {
+    if (trt::ActivationOp op = llvm::dyn_cast<trt::ActivationOp>(operation)) {
+      ActivationFunc(
+          op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
+    } else if (trt::FullyConnectedOp op =
+                   llvm::dyn_cast<trt::FullyConnectedOp>(operation)) {
+      FcFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
+    } else if (trt::ConvolutionOp op =
+                   llvm::dyn_cast<trt::ConvolutionOp>(operation)) {
+      ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map);
+    } else {
+      CHECK(false) << "not supported operation.";
+    }
+  }
+
+  for (auto index_operand :
+       llvm::enumerate(block.getTerminator()->getOperands())) {
+    mlir::Value arg = index_operand.value();
+    CHECK(value_to_trt_tensor_map.count(arg));
+    // TODO(wilber): A trick that we name trt output tensor's name as output_0,
+    // output_1, ...
+    value_to_trt_tensor_map[arg]->setName(
+        ("output_" + std::to_string(index_operand.index())).c_str());
+    network->markOutput(*value_to_trt_tensor_map[arg]);
+  }
+  for (int i = 0; i < network->getNbOutputs(); ++i) {
+    engine.PrepareOutputHandle(network->getOutput(i)->getName());
+  }
+
+  VLOG(3) << "trt engine build start.";
+  engine.Build(std::move(network), options);
+  VLOG(3) << "trt engine build done.";
+
+  // TODO(wilber): get inference options from mlir.
+  backends::tensorrt::InferenceOptions inference_options;
+  inference_options.batch = 1;
+  // TODO(wilber): bind trt input/output tensors.
+  engine.SetUpInference(inference_options, trt_bind_inputs);
+  return engine;
+}
+
+void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) {
+  engine->GetEngineInfo();
+}
+
+std::vector<phi::DenseTensor*> TrtEngineCompute(
+    backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) {
+  engine->Run(context);
+  std::vector<phi::DenseTensor*> res;
+  for (size_t i = 0; i < engine->GetOutputNum(); ++i) {
+    res.push_back(engine->GetOutput("output_" + std::to_string(i)));
+  }
+  return res;
+}
+
+}  // namespace tensorrt
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h
new file mode 100644
index 0000000000000..546ee9dc78852
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <utility>
+
+#include "mlir/IR/Operation.h"
+
+#include "paddle/infrt/backends/tensorrt/trt_engine.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+
+namespace infrt {
+namespace host_context {
+class SymbolTable;
+}  // namespace host_context
+
+namespace kernel {
+namespace tensorrt {
+
+struct MlirOperationWithInfrtSymbol {
+  mlir::Operation* operation;
+  ::infrt::host_context::SymbolTable* symbol_table;
+};
+
+::infrt::backends::tensorrt::TrtEngine CreateTrtEngine(
+    MlirOperationWithInfrtSymbol engine_op);
+
+void PrintTrtLayer(backends::tensorrt::TrtEngine* engine);
+
+std::vector<phi::DenseTensor*> TrtEngineCompute(
+    backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context);
+
+}  // namespace tensorrt
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h
new file mode 100644
index 0000000000000..19e20c170ec83
--- /dev/null
+++ b/paddle/infrt/kernel/tensorrt/trt_layers.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <NvInfer.h>
+#include <mlir/IR/Operation.h>
+
+#include <string>
+
+#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
+#include "paddle/infrt/kernel/tensorrt/trt_helper.h"
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace infrt {
+namespace kernel {
+namespace tensorrt {
+
+using ValueToTensorMap = llvm::DenseMap<mlir::Value, phi::DenseTensor*>;
+using ValueToITensorMap = llvm::DenseMap<mlir::Value, nvinfer1::ITensor*>;
+
+inline void ActivationFunc(
+    trt::ActivationOp& act_op,  // NOLINT
+    nvinfer1::INetworkDefinition* network,
+    ValueToITensorMap& value_to_trt_tensor_map,  // NOLINT
+    ValueToTensorMap& value_to_tensor_map) {     // NOLINT
+  auto in_arg = act_op.getOperand();
+  CHECK(value_to_trt_tensor_map.count(in_arg))
+      << "value_to_trt_tensor_map not has in_arg.";
+
+  nvinfer1::ActivationType act_type =
+      static_cast<nvinfer1::ActivationType>(act_op.activation_type());
+  auto* act_layer =
+      network->addActivation(*value_to_trt_tensor_map[in_arg], act_type);
+  act_layer->setAlpha(act_op.alpha().convertToFloat());
+  act_layer->setBeta(act_op.beta().convertToFloat());
+  for (size_t i = 0; i < act_op->getNumResults(); ++i) {
+    nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i);
+    mlir::Value act_out = act_op->getResult(i);
+    value_to_trt_tensor_map[act_out] = act_out_tensor;
+  }
+}
+
+inline void ConvFunc(trt::ConvolutionOp& op,  // NOLINT
+                     nvinfer1::INetworkDefinition* network,
+                     ValueToITensorMap& value_to_trt_tensor_map,  // NOLINT
+                     ValueToTensorMap& value_to_tensor_map) {     // NOLINT
+  mlir::Value input_tensor_repr = op.input_tensor();
+  int out_channel_num = op.out_channel_num();
+  auto size_attrs = op.kernel_size();
+  nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs);
+  auto kernel_weights =
+      TensorToWeights(value_to_tensor_map[op.kernel_weights()]);
+  auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]);
+
+  auto* layer =
+      network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr],
+                                out_channel_num,
+                                dims,
+                                kernel_weights,
+                                bias_weights);
+  CHECK_NOTNULL(layer);
+  mlir::Value out_repr = op.output_tensor();
+  nvinfer1::ITensor* out_tensor = layer->getOutput(0);
+  value_to_trt_tensor_map[out_repr] = out_tensor;
+}
+
+inline void FcFunc(trt::FullyConnectedOp& op,  // NOLINT
+                   nvinfer1::INetworkDefinition* network,
+                   ValueToITensorMap& value_to_trt_tensor_map,  // NOLINT
+                   ValueToTensorMap& value_to_tensor_map) {     // NOLINT
+  mlir::Value input_tensor_repr = op.input_tensor();
+  CHECK(value_to_trt_tensor_map.count(input_tensor_repr));
+
+  auto kernel_weights =
+      TensorToWeights(value_to_tensor_map[op.kernel_weights()]);
+  auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]);
+
+  int out_channel_num = op.out_channel_num();
+  auto* layer =
+      network->addFullyConnected(*value_to_trt_tensor_map[input_tensor_repr],
+                                 out_channel_num,
+                                 kernel_weights,
+                                 bias_weights);
+
+  mlir::Value out_repr = op.output_tensor();
+  nvinfer1::ITensor* out_tensor = layer->getOutput(0);
+  value_to_trt_tensor_map[out_repr] = out_tensor;
+}
+}  // namespace tensorrt
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/tensor/CMakeLists.txt b/paddle/infrt/tensor/CMakeLists.txt
index 95b2e8f683926..95d4090a9a3f7 100644
--- a/paddle/infrt/tensor/CMakeLists.txt
+++ b/paddle/infrt/tensor/CMakeLists.txt
@@ -1,5 +1,7 @@
 core_gather_headers()
 
+add_subdirectory(phi)
+
 gather_srcs(infrt_src SRCS
   tensor_map.cc
   tensor_metadata.cc
diff --git a/paddle/infrt/tensor/phi/CMakeLists.txt b/paddle/infrt/tensor/phi/CMakeLists.txt
new file mode 100644
index 0000000000000..97e26661266e9
--- /dev/null
+++ b/paddle/infrt/tensor/phi/CMakeLists.txt
@@ -0,0 +1,3 @@
+gather_srcs(infrt_src SRCS
+  tensor_map.cc
+)
diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc
new file mode 100644
index 0000000000000..7690322aed4a3
--- /dev/null
+++ b/paddle/infrt/tensor/phi/tensor_map.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/infrt/tensor/phi/tensor_map.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace infrt {
+namespace phi {
+
+void DenseTensorMap::SetDenseTensor(
+    const std::string& name, std::unique_ptr<::phi::DenseTensor>&& tensor) {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto it = map_.emplace(std::make_pair(name, std::move(tensor)));
+  if (!it.second) {
+    llvm_unreachable("dense tensor map insert failed.");
+  }
+}
+
+::phi::DenseTensor* DenseTensorMap::GetDenseTensor(
+    const std::string& name) const {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto it = map_.find(name);
+  if (it != map_.end()) {
+    return it->second.get();
+  }
+  LOG(WARNING) << "can not find `" << name << "` in the tensor map.";
+  return nullptr;
+}
+
+size_t DenseTensorMap::size() const {
+  std::lock_guard<std::mutex> lock(mu_);
+  return map_.size();
+}
+
+}  // namespace phi
+}  // namespace infrt
diff --git a/paddle/infrt/tensor/phi/tensor_map.h b/paddle/infrt/tensor/phi/tensor_map.h
new file mode 100644
index 0000000000000..1b9fbdd9defc7
--- /dev/null
+++ b/paddle/infrt/tensor/phi/tensor_map.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace infrt {
+namespace phi {
+
+class DenseTensorMap {
+ public:
+  DenseTensorMap() = default;
+  DenseTensorMap(DenseTensorMap&& other) : map_(std::move(other.map_)) {}
+  void SetDenseTensor(const std::string& name,
+                      std::unique_ptr<::phi::DenseTensor>&& tensor);
+  ::phi::DenseTensor* GetDenseTensor(const std::string& name) const;
+  size_t size() const;
+
+ private:
+  mutable std::mutex mu_;
+  std::unordered_map<std::string, std::unique_ptr<::phi::DenseTensor>> map_;
+};
+
+}  // namespace phi
+}  // namespace infrt
diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt
index 5ce6d8673421b..6f839cdc39549 100644
--- a/paddle/infrt/tests/CMakeLists.txt
+++ b/paddle/infrt/tests/CMakeLists.txt
@@ -1,6 +1,9 @@
+cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS})
+
 configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py")
 
 add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\""
-    DEPENDS infrtopt infrtexec phi-ir-exec)
+    DEPENDS infrtopt infrtexec)
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/phi/linear_cpu.mlir)
diff --git a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir
similarity index 97%
rename from paddle/infrt/tests/dialect/rewrite.mlir
rename to paddle/infrt/tests/dialect/pd/rewrite.mlir
index 9fbb09e22449f..ea0248b9d95d2 100644
--- a/paddle/infrt/tests/dialect/rewrite.mlir
+++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir
@@ -1,4 +1,4 @@
-// RUN: infrtopt --canonicalize %s | FileCheck %s
+// RUN: infrtopt --pd-op-fuse %s | FileCheck %s
 // CHECK-LABEL: @main
 func @main() -> tensor<?xf32> {
   %a = "pd.feed"() {name="input0"} : () -> tensor<?xf32>
diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir
index 3657777a5b0bc..b8cb1a5cec2a1 100644
--- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir
+++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir
@@ -3,7 +3,7 @@
 // CHECK-LABEL: @sign_any_float32_execute
 func @sign_any_float32_execute() {
   %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
-  %t = "phi_dt.create_dense_tensor" (%ctx) {
+  %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {
     precision=#infrt.precision<FP32>, 
     layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
   "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
diff --git a/paddle/infrt/tests/dialect/phi/linear_cpu.mlir.in b/paddle/infrt/tests/dialect/phi/linear_cpu.mlir.in
new file mode 100644
index 0000000000000..7ca33fa10a90d
--- /dev/null
+++ b/paddle/infrt/tests/dialect/phi/linear_cpu.mlir.in
@@ -0,0 +1,19 @@
+// RUN: infrtexec -i %s
+module  {
+  func @main_graph(%arg0: !phi.dense_tensor_map, %arg1: !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW> {
+    %0 = phi_dt.tensor_map_get_tensor(%arg0) {name = "linear_0.w_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    %1 = phi_dt.tensor_map_get_tensor(%arg0) {name = "linear_0.b_0"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    %2 = "phi_dt.create_context.cpu"() : () -> !phi.context<CPU>
+    %5 = "phi_cpu.matmul.float32.any"(%2, %arg1, %0) {trans_x = false, trans_y = false} : (!phi.context<CPU>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    %7 = "phi_cpu.add.float32.any"(%2, %5, %1): (!phi.context<CPU>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    infrt.return %7 : !infrt.dense_tensor<CPU, FP32, NCHW>
+  }
+  func @main() {
+    %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
+    %1 = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[16:i64, 784:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+    %map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/linear/linear.pdmodel",params_path="@CMAKE_BINARY_DIR@/linear/linear.pdiparams"}
+    %2 = infrt.call@main_graph(%map, %1) : (!phi.dense_tensor_map, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
+    phi_dt.print_tensor (%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
+    infrt.return
+  }
+}
diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir
index 61a66cb3d71a3..47badd97d37db 100644
--- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir
+++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir
@@ -1,4 +1,5 @@
-// RUN: phi-ir-exec %s
+// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s
+
 // CHECK-LABEL: @ops
 func @ops() {
   %a = pd.feed() {name="input0"} : !infrt.lod_tensor<?xf32,0>
@@ -8,3 +9,10 @@ func @ops() {
   %h = "pd.abs"(%g):(tensor<?xf32>) -> tensor<?xf32>
   "pd.fetch"(%h) {name="output"} :(tensor<?xf32>)->()
 }
+
+// CHECK-LABEL: @op_execute
+func @op_execute(%a:!infrt.lod_tensor<?xf32,0>, %b:!infrt.lod_tensor<?xf32,0>, %c:!infrt.lod_tensor<?xf32,0>)  -> !infrt.lod_tensor<?xf32,0> {
+  %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor<?xf32,0>, !infrt.lod_tensor<?xf32>) -> tensor<?xf32>
+  %h = "pd.abs"(%g):(tensor<?xf32>) -> tensor<?xf32>
+  "pd.fetch"(%h) {name="output"} :(tensor<?xf32>)->()
+}
diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir
index 5b0fa735897a3..21ee8ebf0b705 100644
--- a/paddle/infrt/tests/dialect/phi/phi_test.mlir
+++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir
@@ -6,7 +6,7 @@ module  {
   }
   func @main() {
     %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
-    %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+    %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision<FP32>, layout=#infrt.layout<NCHW>, lod=[1:i64], dims=[1:i64]}: (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
     "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
     %2 = infrt.call@predict(%t) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<CPU, FP32, NCHW>
     phi_dt.print_tensor(%2 : !infrt.dense_tensor<CPU, FP32, NCHW>)
diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
index 7aeb3f8a4d051..9e3773edd77b0 100644
--- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
+++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in
@@ -12,3 +12,30 @@ func @load_tensor_map() {
 
   infrt.return
 }
+
+func @load_phi_tensor_map() {
+  %map = phi_dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"}
+  %size = phi_dt.tensor_map_get_size(%map) -> i32
+  infrt.print.i32 %size
+
+  %a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
+
+  // CHECK: dense_tensor: shape=shape[2], value=[0,0]
+  phi_dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
+
+  infrt.return
+}
+
+func @load_combined_phi_tensor_map() {
+  %map = phi_dt.load_combined_params(){model_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdmodel",
+    params_path="@CMAKE_BINARY_DIR@/multi_fc_model/fc.pdiparams"}
+  %size = phi_dt.tensor_map_get_size(%map) -> i32
+  infrt.print.i32 %size
+
+  %a = phi_dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor<CPU, FP32, NCHW>
+
+  // CHECK: dense_tensor: shape=shape[2], value=[0,0]
+  phi_dt.print_tensor (%a : !infrt.dense_tensor<CPU, FP32, NCHW>)
+
+  infrt.return
+}
diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir
new file mode 100644
index 0000000000000..ef86dcf1e72a0
--- /dev/null
+++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir
@@ -0,0 +1,37 @@
+// RUN: infrtexec -i %s | FileCheck %s
+
+// CHECK-LABEL: @run_trt
+func @run_trt(%0 : !infrt.dense_tensor<GPU, FP32, NCHW>, %ctx : !phi.context<GPU>) {
+  %a = "trt.create_engine"(%0) ({
+    %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+    "infrt.return"(%1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  }) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !trt.engine
+  "trt.inspect_engine"(%a) {} : (!trt.engine) -> ()
+
+  %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
+  %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
+  "infrt.print.i32"(%size) {} : (i32) -> ()
+
+  %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  infrt.return
+}
+
+// CHECK-LABEL: @main
+func @main() {
+  %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
+  %t = "phi_dt.create_dense_tensor.gpu" (%ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+
+  "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  //%res = 
+  infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !phi.context<GPU>) -> ()
+  //-> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+
+  infrt.return
+}
diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir
new file mode 100644
index 0000000000000..c67d47415bfb0
--- /dev/null
+++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir
@@ -0,0 +1,54 @@
+// RUN: infrtexec -i %s | FileCheck %s
+
+// CHECK-LABEL: @run_trt
+func @run_trt(%input_tensor : !infrt.dense_tensor<GPU, FP32, NCHW>, %kernel_weight : !infrt.dense_tensor<CPU, FP32, NCHW>, %kernel_bias : !infrt.dense_tensor<CPU, FP32, NCHW>, %gpu_ctx : !phi.context<GPU>) {
+  %a = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({
+    %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+    %2 = "trt.Convolution"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 3 : si32, kernel_size = [3:i32, 3:i32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+    "infrt.return"(%1, %2) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  }) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
+  "trt.inspect_engine"(%a) {} : (!trt.engine) -> ()
+
+  %res = "trt.compute"(%a, %gpu_ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
+  %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
+  "infrt.print.i32"(%size) {} : (i32) -> ()
+
+  %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  infrt.return
+}
+
+// CHECK-LABEL: @main
+func @main() {
+  %gpu_ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
+  %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
+
+  %input_tensor = "phi_dt.create_dense_tensor.gpu" (%gpu_ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[1:i64, 3:i64, 28:i64, 28:i64], lod=[0:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  // "phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[3:i64, 3:i64, 3:i64, 3:i64], lod=[0:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  // "phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+
+  %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[3:i64], lod=[0:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  // "phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+
+  infrt.call @run_trt(%input_tensor, %kernel_weight, %kernel_bias, %gpu_ctx) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !phi.context<GPU>) -> ()
+
+  infrt.return
+}
diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir
new file mode 100644
index 0000000000000..78dc4ac1c1093
--- /dev/null
+++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir
@@ -0,0 +1,46 @@
+// RUN: infrtexec -i %s | FileCheck %s
+
+// CHECK-LABEL: @main
+func @main() {
+  %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context<GPU>
+  %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context<CPU>
+
+  %input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context<GPU>) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  //"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  //"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+
+  %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) {
+    precision=#infrt.precision<FP32>,
+    layout=#infrt.layout<NCHW>,
+    dims=[2:i64], lod=[1:i64]} : (!phi.context<CPU>) -> (!infrt.dense_tensor<CPU, FP32, NCHW>)
+  "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+  //"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor<CPU, FP32, NCHW>) -> ()
+
+  %engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({
+    %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+    %2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+    "infrt.return"(%1, %2) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+  }) : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>, !infrt.dense_tensor<CPU, FP32, NCHW>) -> !trt.engine
+
+  %res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context<GPU>) -> (!infrt.tensor_list)
+  %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32)
+  "infrt.print.i32"(%size) {} : (i32) -> ()
+
+  %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor<GPU, FP32, NCHW>)
+  "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> ()
+
+  infrt.return
+}
diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir
index e3cb9670bec01..7bdf62a277896 100644
--- a/paddle/infrt/tests/dialect/trt_ops.mlir
+++ b/paddle/infrt/tests/dialect/trt_ops.mlir
@@ -1,16 +1,16 @@
 // RUN: trt-exec %s
 // CHECK-LABEL: @main
-func @main(%bias:tensor<?xf32>, %c:tensor<?xf32>, %b1:tensor<?xf32>, %b2:tensor<?xf32>, %bias1:tensor<?xf32>, %bias2:tensor<?xf32>) -> tensor<?xf32> {
-  %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  %e = "pd.relu6"(%d) {} : (tensor<?xf32>) -> tensor<?xf32>
+func @main(%bias:!infrt.dense_tensor<GPU, FP32, NCHW>, %c:!infrt.dense_tensor<GPU, FP32, NCHW>, %b1:!infrt.dense_tensor<GPU, FP32, NCHW>, %b2:!infrt.dense_tensor<GPU, FP32, NCHW>, %bias1:!infrt.dense_tensor<GPU, FP32, NCHW>, %bias2:!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW> {
+  %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+  %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
 
-  %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  %e1 = "pd.relu"(%d1) {} : (tensor<?xf32>) -> tensor<?xf32>
+  %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+  %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+  %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
 
-  %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
-  %e2 = "pd.relu"(%d2) {} : (tensor<?xf32>) -> tensor<?xf32>
+  %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+  %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor<GPU, FP32, NCHW>, !infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
+  %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor<GPU, FP32, NCHW>) -> !infrt.dense_tensor<GPU, FP32, NCHW>
   
-  infrt.return %e2 : tensor<?xf32>
+  infrt.return %e2 : !infrt.dense_tensor<GPU, FP32, NCHW>
 }
diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py
new file mode 100644
index 0000000000000..dd1632bc9d4d8
--- /dev/null
+++ b/paddle/infrt/tests/model/abs_model.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.nn import Layer
+from paddle.static import InputSpec
+from paddle.jit import to_static
+import sys
+
+
+class AbsNet(paddle.nn.Layer):
+    def __init__(self):
+        super(AbsNet, self).__init__()
+
+    def forward(self, x):
+        x = paddle.abs(x)
+        return x
+
+
+if __name__ == '__main__':
+    # build network
+    model = AbsNet()
+    # save inferencing format model
+    net = to_static(
+        model, input_spec=[InputSpec(
+            shape=[None, 1, 28, 28], name='x')])
+    paddle.jit.save(net, sys.argv[1])
diff --git a/paddle/infrt/tests/model/linear.py b/paddle/infrt/tests/model/linear.py
new file mode 100644
index 0000000000000..602e067365b87
--- /dev/null
+++ b/paddle/infrt/tests/model/linear.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# example 1: save layer
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.optimizer as opt
+
+BATCH_SIZE = 16
+BATCH_NUM = 4
+EPOCH_NUM = 4
+
+IMAGE_SIZE = 784
+CLASS_NUM = 10
+
+
+# define a random dataset
+class RandomDataset(paddle.io.Dataset):
+    def __init__(self, num_samples):
+        self.num_samples = num_samples
+
+    def __getitem__(self, idx):
+        image = np.random.random([IMAGE_SIZE]).astype('float32')
+        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
+        return image, label
+
+    def __len__(self):
+        return self.num_samples
+
+
+class LinearNet(nn.Layer):
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        return self._linear(x)
+
+
+def train(layer, loader, loss_fn, opt):
+    for epoch_id in range(EPOCH_NUM):
+        for batch_id, (image, label) in enumerate(loader()):
+            out = layer(image)
+            loss = loss_fn(out, label)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+
+
+# 1. train & save model.
+
+# create network
+layer = LinearNet()
+loss_fn = nn.CrossEntropyLoss()
+adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
+
+# create data loader
+dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
+loader = paddle.io.DataLoader(
+    dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)
+
+# train
+train(layer, loader, loss_fn, adam)
+
+# save
+path = "linear/linear"
+paddle.jit.save(layer, path)
diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc
new file mode 100644
index 0000000000000..49266910dbd27
--- /dev/null
+++ b/paddle/infrt/tests/model/test_abs.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <llvm/Support/CommandLine.h>
+#include <mlir/Pass/PassManager.h>
+#include <iostream>
+#include <string>
+
+#include "llvm/Support/DynamicLibrary.h"
+#include "paddle/infrt/common/global.h"
+#include "paddle/infrt/dialect/mlir_loader.h"
+#include "paddle/infrt/host_context/core_runtime.h"
+#include "paddle/infrt/host_context/kernel_registry.h"
+#include "paddle/infrt/host_context/mlir_to_runtime_translate.h"
+#include "paddle/infrt/kernel/basic_kernels.h"
+#include "paddle/infrt/kernel/control_flow_kernels.h"
+#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h"
+#include "paddle/infrt/kernel/phi/registry.h"
+#include "paddle/infrt/kernel/tensor_kernels.h"
+#include "paddle/infrt/kernel/tensor_shape_kernels.h"
+#include "paddle/infrt/kernel/test_kernels.h"
+
+#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/meta_tensor.h"
+
+#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h"
+#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h"
+
+#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h"
+#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h"
+#include "paddle/infrt/host_context/paddle_mlir.h"
+
+#include "paddle/infrt/dialect/dense_tensor.h"
+#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h"
+#include "paddle/infrt/dialect/phi/ir/phi_base.h"
+#include "paddle/infrt/dialect/phi/ir/phi_kernels.h"
+
+static llvm::cl::list<std::string> cl_shared_libs(  // NOLINT
+    "shared_libs",
+    llvm::cl::desc("Specify shared library with kernels."),
+    llvm::cl::ZeroOrMore,
+    llvm::cl::MiscFlags::CommaSeparated);
+
+TEST(ABS_MODEL, convert_and_execute) {
+  std::string model_file_name = "./abs.pdmodel";
+  std::string params_file_name = "./abs.pdiparams";
+  // convert model
+  MLIRModelGenImpl myGen;
+  auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name);
+  module_.dump();
+  // pick kernel
+  mlir::MLIRContext* context = infrt::Global::getMLIRContext();
+  context->allowUnregisteredDialects();
+  context->getOrLoadDialect<mlir::StandardOpsDialect>();
+
+  context->getOrLoadDialect<infrt::InfrtDialect>();
+  context->getOrLoadDialect<infrt::ts::TensorShapeDialect>();
+  context->getOrLoadDialect<infrt::InfrtDialect>();
+  context->getOrLoadDialect<infrt::dt::DTDialect>();
+  context->getOrLoadDialect<infrt::pd::PaddleDialect>();
+
+  context->getOrLoadDialect<infrt::phi::PHIDenseTensorDialect>();
+  context->getOrLoadDialect<infrt::phi::PHICPUKernelDialect>();
+  context->getOrLoadDialect<infrt::phi::PHIGPUKernelDialect>();
+  context->getOrLoadDialect<infrt::phi::PHIDialect>();
+
+  context->loadAllAvailableDialects();
+  mlir::PassManager pm(context);
+
+  mlir::OpPassManager& phi_pass_manager = pm.nest<mlir::FuncOp>();
+  std::vector<infrt::Place> valid_places = {{infrt::TargetType::CPU,
+                                             infrt::PrecisionType::FLOAT32,
+                                             infrt::LayoutType::NCHW}};
+  phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places));
+  phi_pass_manager.addPass(infrt::createInfrtOpFusePass());
+
+  if (mlir::failed(pm.run(module_))) {
+    std::cout << "\npass failed!\n" << std::endl;
+  }
+  module_.dump();
+
+  // executate
+  infrt::host_context::KernelRegistry registry;
+  infrt::kernel::RegisterBasicKernels(&registry);
+  infrt::kernel::RegisterTestKernels(&registry);
+  infrt::kernel::RegisterTensorShapeKernels(&registry);
+  infrt::kernel::RegisterTensorKernels(&registry);
+  infrt::kernel::RegisterControlFlowKernels(&registry);
+  infrt::kernel::RegisterPhiKernels(&registry);
+  infrt::kernel::RegisterInferShapeLaunchers(&registry);
+  // load extra shared library
+  for (const auto& lib_path : cl_shared_libs) {
+    std::string err;
+    llvm::sys::DynamicLibrary dynLib =
+        llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err);
+    if (!dynLib.isValid()) {
+      llvm::errs() << "Load shared library failed. Error: " << err << "\n";
+      break;
+    }
+    if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) {
+      auto reg_func =
+          reinterpret_cast<void (*)(infrt::host_context::KernelRegistry*)>(
+              reg_sym);
+      reg_func(&registry);
+    } else {
+      llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path
+                   << "\". Skip.\n";
+    }
+  }
+  infrt::host_context::TestMlir(module_, &registry);
+}
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 7b074d0ebb76d..04e1bbcc9df42 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -25,8 +25,6 @@ add_subdirectory(tests)
 # make an unity target for compile deps
 set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor)
 get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
-# keep this message for debug, remove it later if needless
-message(STATUS "All standard phi kernels: ${phi_kernels}")
 set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
 
 cc_library(phi DEPS ${PHI_DEPS})
diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h
new file mode 100644
index 0000000000000..a2983d9c2aa65
--- /dev/null
+++ b/paddle/phi/api/include/context_pool.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/macros.h"
+#include "paddle/utils/flat_hash_map.h"
+
+namespace phi {
+class DeviceContext;
+class CPUContext;
+class GPUContext;
+}  // namespace phi
+
+namespace paddle {
+namespace experimental {
+
+template <AllocationType T>
+struct DefaultDeviceContextType;
+
+template <>
+struct DefaultDeviceContextType<AllocationType::CPU> {
+  using TYPE = phi::CPUContext;
+};
+
+template <>
+struct DefaultDeviceContextType<AllocationType::GPU> {
+  using TYPE = phi::GPUContext;
+};
+
+/**
+ * The DeviceContextPool here is just a mirror of the DeviceContextPool in
+ * fluid, and does not manage the life cycle of the DeviceContext.
+ * It is mainly used for external custom operator calls and high-performance
+ * C++ APIs.
+ *
+ * Since DeviceContextPool in fluid is a global singleton, it always exists
+ * in program running, so DeviceContextPool here can always access the correct
+ * DeviceContext pointer.
+ *
+ * In order not to depend on the fluid's DeviceContextPool,
+ * the DeviceContextPool here needs to be initialized in the fluid, and cannot
+ * be initialized by itself.
+ */
+class DeviceContextPool {
+ public:
+  static DeviceContextPool& Instance();
+
+  const phi::DeviceContext* Get(const Place& place);
+
+  phi::DeviceContext* GetMutable(const Place& place);
+
+  template <AllocationType T>
+  const typename DefaultDeviceContextType<T>::TYPE* Get(const Place& place) {
+    return reinterpret_cast<const typename DefaultDeviceContextType<T>::TYPE*>(
+        Get(place));
+  }
+
+ private:
+  DeviceContextPool() = default;
+
+  paddle::flat_hash_map<Place, const phi::DeviceContext*, Place::Hash>
+      context_map_;
+  std::mutex mutex_;
+
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h
index c268742fa567b..6fab6643f398d 100644
--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -31,7 +31,6 @@ using gpuStream_t = hipStream_t;
 
 #include "paddle/phi/api/ext/dll_decl.h"
 #include "paddle/phi/api/ext/place.h"
-#include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/common/place.h"
@@ -225,6 +224,22 @@ class PADDLE_API Tensor final {
    */
   bool is_selected_rows() const;
 
+  /**
+   * @brief Determine whether tensor is SparseCooTensor
+   *
+   * @return true
+   * @return false
+   */
+  bool is_sparse_coo_tensor() const;
+
+  /**
+   * @brief Determine whether tensor is SparseCsrTensor
+   *
+   * @return true
+   * @return false
+   */
+  bool is_sparse_csr_tensor() const;
+
   /* Part 3: Device and Backend methods */
 
   /**
@@ -253,12 +268,20 @@ class PADDLE_API Tensor final {
   bool is_cpu() const;
 
   /**
-   * @brief Determine whether the tensor device is CUDA
+   * @brief Determine whether the tensor device is GPU
+   *
+   * @return true
+   * @return false
+   */
+  bool is_gpu() const;
+
+  /**
+   * @brief Determine whether the tensor device is GPU_PINNED
    *
    * @return true
    * @return false
    */
-  bool is_cuda() const;
+  bool is_gpu_pinned() const;
 
   /* Part 4: Data Access methods */
 
@@ -324,7 +347,7 @@ class PADDLE_API Tensor final {
    *
    * @return std::shared_ptr<phi::TensorBase>
    */
-  std::shared_ptr<phi::TensorBase> impl() const;
+  const std::shared_ptr<phi::TensorBase>& impl() const;
 
   /**
    * @brief Set the implemention of current Tensor.
@@ -333,6 +356,13 @@ class PADDLE_API Tensor final {
    */
   void set_impl(const std::shared_ptr<phi::TensorBase>& impl);
 
+  /**
+   * @brief Set the implemention of current Tensor.
+   *
+   * @param impl
+   */
+  void set_impl(std::shared_ptr<phi::TensorBase>&& impl);
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   /**
    * @brief Get the stream where the tensor is currently located
@@ -384,11 +414,11 @@ class PADDLE_API Tensor final {
   /**
    * @brief Transfer the current Tensor to the specified device and return.
    *
-   * @param backend, The target backend of which the tensor will copy to.
+   * @param place, The target place of which the tensor will copy to.
    * @param blocking, Should we copy this in sync way.
    * @return Tensor
    */
-  Tensor copy_to(Backend backend, bool blocking) const;
+  Tensor copy_to(Place place, bool blocking) const;
 
   /**
    * @brief Transfer the source Tensor to current Tensor.
@@ -397,7 +427,9 @@ class PADDLE_API Tensor final {
    * @param blocking, Should we copy this in sync way.
    * @return void
    */
-  void copy_(const Tensor& src, const bool blocking);
+  void copy_(const Tensor& src,
+             const phi::Place& target_place,
+             const bool blocking);
   /**
    * @brief Cast datatype from one to another
    *
@@ -472,7 +504,21 @@ class PADDLE_API Tensor final {
    */
   void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta);
 
-  /* Part 9: Auto generated Tensor methods */
+  /* Part 9: Inplace methods */
+
+  /**
+   * @brief Increase inplace version
+   */
+  void bump_inplace_version();
+
+  /**
+   * @brief Get current inplace version
+   *
+   * @return uint32_t
+   */
+  uint32_t current_inplace_version();
+
+  /* Part 10: Auto generated Tensor methods */
 
  private:
   /**
diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 42bf7a8103f83..50c267f653564 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -135,8 +135,9 @@ add_custom_command(
 
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw)
 cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi)
+cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place)
 
-cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory)
+cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool)
 cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
 cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
 cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
@@ -148,4 +149,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph
 cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl)
 cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl)
 
-cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api)
+cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta)
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index fc1afb26bf414..98f28ddcbdb33 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/infermeta/binary.h"
@@ -31,9 +32,10 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
+Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
+  kernel_key_set.backend_set =
+      kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
   auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       "copy", kernel_key);
@@ -57,8 +59,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
                                     phi::DenseTensor*);
 
   auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-  (*kernel_fn)(
-      *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out);
+  (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
 
   return out;
 }
diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h
index 5acb68a328133..48eda2d954647 100644
--- a/paddle/phi/api/lib/api_custom_impl.h
+++ b/paddle/phi/api/lib/api_custom_impl.h
@@ -15,15 +15,14 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/common/backend.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 
 namespace paddle {
 namespace experimental {
 
-// TODO(chenweihang): Replace backend by place when place is ready
-Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking);
+Tensor copy_to_impl(const Tensor& x, Place place, bool blocking);
 
 std::vector<Tensor> split_impl(const Tensor& x,
                                const ScalarArray& num_or_sections,
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index e1ebe8c6465cf..0c11e2df65d0d 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -95,12 +95,8 @@ paddle::optional<phi::MetaTensor> MakeMetaTensor(
 /* ------------------ for output ----------------------- */
 
 phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) {
-  if (!out->initialized()) {
-    auto dense_tensor = std::make_shared<phi::DenseTensor>(
-        phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
-        phi::DenseTensorMeta());
-    out->set_impl(dense_tensor);
-    return dense_tensor.get();
+  if (out->impl() == nullptr) {
+    out->set_impl(std::make_shared<phi::DenseTensor>());
   }
   return static_cast<phi::DenseTensor*>(out->impl().get());
 }
@@ -111,9 +107,7 @@ std::vector<phi::DenseTensor*> SetKernelOutput(size_t out_size,
   out->reserve(out_size);
   std::vector<phi::DenseTensor*> results(out_size);
   for (size_t i = 0; i < out_size; ++i) {
-    auto tensor_ptr = std::make_shared<phi::DenseTensor>(
-        phi::make_intrusive<SharedStorage>(phi::TransToPhiPlace(backend)),
-        phi::DenseTensorMeta());
+    auto tensor_ptr = std::make_shared<phi::DenseTensor>();
     results[i] = tensor_ptr.get();
     out->emplace_back();
     out->back().set_impl(tensor_ptr);
diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h
index 88f7b086715d6..2aa4f969221d9 100644
--- a/paddle/phi/api/lib/backend_set.h
+++ b/paddle/phi/api/lib/backend_set.h
@@ -35,7 +35,7 @@ class BackendSet final {
       : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
                                                        1)) {}
 
-  uint64_t bitset() const { return bitset_; }
+  inline uint64_t bitset() const { return bitset_; }
 
   bool inline Has(Backend b) const {
     PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED.");
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
new file mode 100644
index 0000000000000..07ac9822d3310
--- /dev/null
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/api/include/context_pool.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+DeviceContextPool& DeviceContextPool::Instance() {
+  static DeviceContextPool g_device_context_pool;
+  return g_device_context_pool;
+}
+
+const phi::DeviceContext* DeviceContextPool::Get(const Place& place) {
+  auto it = context_map_.find(place);
+  if (it == context_map_.end()) {
+    // only when we need the specific DeviceContext, get and cache it
+    auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      context_map_[place] = dev_ctx;
+    }
+    return dev_ctx;
+  }
+  return it->second;
+}
+
+phi::DeviceContext* DeviceContextPool::GetMutable(const Place& place) {
+  return const_cast<phi::DeviceContext*>(Get(place));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index 79b8ac6d0b835..7d886e50dbc23 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -39,7 +39,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
                                const TransformFlag& transform_flag) {
   bool ret = transform_flag.need_trans_backend() &&
              target != Backend::ALL_BACKEND &&
-             !platform::is_same_place(input, phi::TransToPhiPlace(target));
+             phi::TransToPhiBackend(input) != target;
   return ret;
 }
 
@@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor,
 
   if (NeedTransformPlace(
           out.place(), target_args_def.backend, transform_flag)) {
-    phi::DenseTensor result(
-        phi::make_intrusive<paddle::experimental::SharedStorage>(
-            phi::TransToPhiPlace(target_args_def.backend)),
-        {out.dtype(), out.dims(), out.layout()});
+    phi::DenseTensor result;
     framework::TransDataDevice(
         out, phi::TransToPhiPlace(target_args_def.backend), &result);
     out = result;
@@ -183,21 +180,21 @@ std::shared_ptr<phi::DenseTensor> PrepareData(
     const phi::TensorArgDef& target_args_def,
     const TransformFlag& transform_flag) {
   const auto& tensor_in = input.impl();
-  if (!transform_flag.NeedTransform() || !tensor_in->initialized() ||
+  phi::DenseTensor& dense_tensor =
+      *static_cast<phi::DenseTensor*>(tensor_in.get());
+  if (!transform_flag.NeedTransform() || !dense_tensor.initialized() ||
       (!NeedTransformPlace(
-           tensor_in->place(), target_args_def.backend, transform_flag) &&
+           dense_tensor.place(), target_args_def.backend, transform_flag) &&
        !NeedTransformDataType(
-           tensor_in->dtype(), target_args_def.dtype, transform_flag) &&
+           dense_tensor.dtype(), target_args_def.dtype, transform_flag) &&
        !NeedTransformLayout(
-           tensor_in->layout(), target_args_def.layout, transform_flag))) {
-    return std::dynamic_pointer_cast<phi::DenseTensor>(tensor_in);
+           dense_tensor.layout(), target_args_def.layout, transform_flag))) {
+    return std::static_pointer_cast<phi::DenseTensor>(tensor_in);
   }
 
   phi::DenseTensor out =
-      TransformData(*(static_cast<phi::DenseTensor*>(tensor_in.get())),
-                    target_args_def,
-                    transform_flag);
-  return std::make_shared<phi::DenseTensor>(out);
+      TransformData(dense_tensor, target_args_def, transform_flag);
+  return std::make_shared<phi::DenseTensor>(std::move(out));
 }
 
 std::shared_ptr<phi::DenseTensor> PrepareData(
diff --git a/paddle/phi/api/lib/data_type_set.h b/paddle/phi/api/lib/data_type_set.h
index ecc1b37c3a6af..4b5e6bde24700 100644
--- a/paddle/phi/api/lib/data_type_set.h
+++ b/paddle/phi/api/lib/data_type_set.h
@@ -30,7 +30,7 @@ class DataTypeSet final {
                     ? 0
                     : 1ULL << (static_cast<uint8_t>(dtype) - 1)) {}
 
-  uint64_t bitset() const { return bitset_; }
+  inline uint64_t bitset() const { return bitset_; }
 
   bool inline Has(DataType dtype) const {
     PD_CHECK(dtype != DataType::UNDEFINED,
diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc
index 0e3ca1af4967c..c2f7a7981f001 100644
--- a/paddle/phi/api/lib/kernel_dispatch.cc
+++ b/paddle/phi/api/lib/kernel_dispatch.cc
@@ -14,14 +14,18 @@ limitations under the License. */
 
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 
+#include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/core/compat/convert_utils.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
 namespace paddle {
 namespace experimental {
 namespace detail {
 
-BackendSet GetTensorBackendSet(const Tensor& t) {
-  BackendSet backend_set(phi::TransToPhiBackend(t.inner_place()));
+BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
+  BackendSet backend_set(phi::TransToPhiBackend(t.place()));
   switch (t.layout()) {
     case DataLayout::MKLDNN:
       backend_set = backend_set | BackendSet(Backend::MKLDNN);
@@ -34,6 +38,11 @@ BackendSet GetTensorBackendSet(const Tensor& t) {
 }
 
 std::size_t CountLeadingZeros(uint64_t val) {
+#if defined(__clang__) || defined(__GNUC__)
+  return __builtin_clzl(val);
+#elif defined(_MSC_VER)
+  return __lzcnt64(val);
+#else
   if (val == 0) {
     return 64;
   }
@@ -47,13 +56,14 @@ std::size_t CountLeadingZeros(uint64_t val) {
     }
   }
   return zero_bits;
+#endif
 }
 
 }  // namespace detail
 
 phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) {
-  auto& pool = paddle::platform::DeviceContextPool::Instance();
-  return pool.Get(phi::TransToPhiPlace(backend));
+  auto& pool = paddle::experimental::DeviceContextPool::Instance();
+  return pool.GetMutable(phi::TransToPhiPlace(backend));
 }
 
 DataType ParseDataType(DataType dtype) { return dtype; }
@@ -81,13 +91,17 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) {
   return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor);
 }
 
-Backend ParseBackend(Backend backend) { return backend; }
+Backend ParseBackend(const Place& place) {
+  return phi::TransToPhiBackend(place);
+}
 Backend ParseBackend(const Tensor& tensor) {
   return phi::TransToPhiBackend(tensor.inner_place());
 }
 
-Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) {
-  return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor);
+Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) {
+  return place.GetType() != phi::AllocationType::UNDEFINED
+             ? ParseBackend(place)
+             : ParseBackend(tensor);
 }
 
 DataLayout ParseLayout(DataLayout layout) { return layout; }
diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h
index 9a09bc2183ad7..25b74e7fe31b9 100644
--- a/paddle/phi/api/lib/kernel_dispatch.h
+++ b/paddle/phi/api/lib/kernel_dispatch.h
@@ -33,7 +33,7 @@ namespace paddle {
 namespace experimental {
 
 namespace detail {
-BackendSet GetTensorBackendSet(const Tensor& t);
+BackendSet GetTensorBackendSet(const phi::TensorBase& t);
 std::size_t CountLeadingZeros(uint64_t val);
 }  // namespace detail
 
@@ -93,11 +93,13 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   // TODO(chenweihang): add global device guard method to set backend
   void operator()(const Tensor& x) {
-    key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
-    // TODO(chenweihang): selecte multi layout and dtype
-    key_set.layout = x.layout();
-    key_set.dtype = x.type();
-    dtype_set = dtype_set | DataTypeSet(x.dtype());
+    const phi::TensorBase& tensor = *x.impl();
+    key_set.backend_set =
+        key_set.backend_set | detail::GetTensorBackendSet(tensor);
+    // TODO(chenweihang): select multi layout and dtype
+    key_set.layout = tensor.layout();
+    key_set.dtype = tensor.dtype();
+    dtype_set = dtype_set | DataTypeSet(key_set.dtype);
     auto promote_result = PromoteTypes(dtype_set);
     if (promote_result != DataType::UNDEFINED) {
       key_set.dtype = promote_result;
@@ -105,11 +107,12 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   }
 
   void operator()(const std::vector<Tensor>& x) {
+    const phi::TensorBase& tensor = *x.at(0).impl();
     key_set.backend_set =
-        key_set.backend_set | detail::GetTensorBackendSet(x[0]);
-    // TODO(chenweihang): selecte multi layout and dtype
-    key_set.layout = x[0].layout();
-    key_set.dtype = x[0].type();
+        key_set.backend_set | detail::GetTensorBackendSet(tensor);
+    // TODO(chenweihang): select multi layout and dtype
+    key_set.layout = tensor.layout();
+    key_set.dtype = tensor.dtype();
   }
 
   // skip other type args, these args don't used in kernel selection
@@ -154,7 +157,7 @@ DataType ParseDataType(const Tensor& tensor);
 DataType ParseDataType(const std::vector<Tensor>& tensors);
 DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor);
 
-Backend ParseBackend(Backend backend);
+Backend ParseBackend(const Place& place);
 Backend ParseBackend(const Tensor& tensor);
 template <typename T, typename... Args>
 Backend ParseBackend(T t, Args... args) {
@@ -163,7 +166,7 @@ Backend ParseBackend(T t, Args... args) {
   return static_cast<Backend>(64 -
                               detail::CountLeadingZeros(backend_set.bitset()));
 }
-Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor);
+Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor);
 
 DataLayout ParseLayout(DataLayout layout);
 DataLayout ParseLayout(const Tensor& tensor);
diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc
index 832c19361e5eb..8f8de02e49bdf 100644
--- a/paddle/phi/api/lib/sparse_api_custom_impl.cc
+++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc
@@ -25,25 +25,24 @@ namespace paddle {
 namespace experimental {
 namespace sparse {
 
-Tensor to_sparse_coo_impl(const Tensor& x,
-                          Backend backend,
-                          const int64_t sparse_dim) {
+Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) {
   if (x.layout() == phi::DataLayout::SPARSE_COO) {
     return x;
   }
+
   // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
-  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
   std::string kernel_name = "dense_to_sparse_coo";
   if (x.layout() == phi::DataLayout::SPARSE_CSR) {
     kernel_name = "sparse_csr_to_coo";
   }
 
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_name, kernel_key);
 
-  VLOG(6) << "to API kernel key: " << kernel_key;
+  VLOG(6) << "add API kernel key: " << kernel_key;
   VLOG(6) << "to API kernel: " << kernel;
 
   // 2. Get Device Context
@@ -62,18 +61,18 @@ Tensor to_sparse_coo_impl(const Tensor& x,
 
   // 4. InferMeta
   auto indices_meta =
-      phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW);
-  auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout());
+      phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW);
+  auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout());
 
   // 5. Prepare outputs
   // create empty SparseCooTensor
   phi::DenseTensor non_zero_indices(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(indices_meta));
   phi::DenseTensor non_zero_elements(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(elements_meta));
   auto coo = std::make_shared<phi::SparseCooTensor>(
       non_zero_indices, non_zero_elements, x.dims());
@@ -88,23 +87,23 @@ Tensor to_sparse_coo_impl(const Tensor& x,
   return out;
 }
 
-Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
+Tensor to_sparse_csr_impl(const Tensor& x) {
   if (x.layout() == phi::DataLayout::SPARSE_CSR) {
     return x;
   }
   // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
-  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
   std::string kernel_name = "dense_to_sparse_csr";
   if (x.layout() == phi::DataLayout::SPARSE_COO) {
     kernel_name = "sparse_coo_to_csr";
   }
 
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_name, kernel_key);
 
-  VLOG(6) << "to API kernel key: " << kernel_key;
+  VLOG(6) << "add API kernel key: " << kernel_key;
   VLOG(6) << "to API kernel: " << kernel;
 
   // 2. Get Device Context
@@ -122,24 +121,24 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
 
   // 4. InferMeta
   auto crows_meta =
-      phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW);
+      phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW);
   auto cols_meta =
-      phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW);
-  auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout());
+      phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW);
+  auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout());
 
   // 5. Prepare outputs
   // create empty SparseCooTensor
   phi::DenseTensor non_zero_crows(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(crows_meta));
   phi::DenseTensor non_zero_cols(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(cols_meta));
   phi::DenseTensor non_zero_elements(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(elements_meta));
   auto csr = std::make_shared<phi::SparseCsrTensor>(
       non_zero_crows, non_zero_cols, non_zero_elements, x.dims());
@@ -154,24 +153,25 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) {
   return out;
 }
 
-Tensor to_dense_impl(const Tensor& x, Backend backend) {
+Tensor to_dense_impl(const Tensor& x) {
   if (x.layout() != phi::DataLayout::SPARSE_CSR &&
       x.layout() != phi::DataLayout::SPARSE_COO) {
     return x;
   }
+
   // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
-  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
   std::string kernel_name = "sparse_coo_to_dense";
   if (x.layout() == phi::DataLayout::SPARSE_CSR) {
     kernel_name = "sparse_csr_to_dense";
   }
 
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_name, kernel_key);
 
-  VLOG(6) << "to API kernel key: " << kernel_key;
+  VLOG(6) << "add API kernel key: " << kernel_key;
   VLOG(6) << "to API kernel: " << kernel;
 
   // 2. Get Device Context
@@ -194,7 +194,7 @@ Tensor to_dense_impl(const Tensor& x, Backend backend) {
   // create empty SparseCooTensor
   auto dense_out = std::make_shared<phi::DenseTensor>(
       phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPhiPlace(backend)),
+          phi::TransToPhiPlace(kernel_key.backend())),
       std::move(dense_meta));
 
   kernel_context.EmplaceBackOutput(dense_out.get());
diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.h b/paddle/phi/api/lib/sparse_api_custom_impl.h
index 293b2cfa3d334..6053d281f0ff1 100644
--- a/paddle/phi/api/lib/sparse_api_custom_impl.h
+++ b/paddle/phi/api/lib/sparse_api_custom_impl.h
@@ -21,13 +21,11 @@ namespace paddle {
 namespace experimental {
 namespace sparse {
 
-Tensor to_dense_impl(const Tensor& x, Backend backend);
+Tensor to_dense_impl(const Tensor& x);
 
-Tensor to_sparse_coo_impl(const Tensor& x,
-                          Backend backend,
-                          const int64_t sparse_dim);
+Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim);
 
-Tensor to_sparse_csr_impl(const Tensor& x, Backend backend);
+Tensor to_sparse_csr_impl(const Tensor& x);
 
 }  // namespace sparse
 }  // namespace experimental
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 40174a505dcc9..b9b6ca36f673b 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -25,6 +25,8 @@ limitations under the License. */
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/tensor_utils.h"
@@ -46,6 +48,7 @@ limitations under the License. */
  * In the future, the necessary components will be moved to the this library,
  * or the corresponding components will be re-implemented.
  */
+
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
@@ -131,6 +134,12 @@ bool Tensor::is_dense_tensor() const {
 bool Tensor::is_selected_rows() const {
   return phi::SelectedRows::classof(impl_.get());
 }
+bool Tensor::is_sparse_coo_tensor() const {
+  return phi::SparseCooTensor::classof(impl_.get());
+}
+bool Tensor::is_sparse_csr_tensor() const {
+  return phi::SparseCsrTensor::classof(impl_.get());
+}
 /* Part 3: Device and Backend methods */
 
 PlaceType Tensor::place() const {
@@ -142,17 +151,26 @@ PlaceType Tensor::place() const {
 }
 
 paddle::platform::Place Tensor::inner_place() const {
-  return ConvertExtPlaceToInnerPlace(place());
+  PADDLE_ENFORCE_NOT_NULL(
+      impl_,
+      phi::errors::PermissionDenied(
+          "Null pointer error, the impl_ of Tensor should not be "
+          "Null when calling Tensor::inner_place()."));
+  return impl_->place();
 }
 
 bool Tensor::is_cpu() const {
   return paddle::platform::is_cpu_place(inner_place());
 }
 
-bool Tensor::is_cuda() const {
+bool Tensor::is_gpu() const {
   return paddle::platform::is_gpu_place(inner_place());
 }
 
+bool Tensor::is_gpu_pinned() const {
+  return paddle::platform::is_cuda_pinned_place(inner_place());
+}
+
 /* Part 4: Data Access methods */
 
 template <typename T>
@@ -286,12 +304,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const {
   }
 }
 
-std::shared_ptr<phi::TensorBase> Tensor::impl() const { return impl_; }
+const std::shared_ptr<phi::TensorBase> &Tensor::impl() const { return impl_; }
 
 void Tensor::set_impl(const std::shared_ptr<phi::TensorBase> &impl) {
   impl_ = impl;
 }
 
+void Tensor::set_impl(std::shared_ptr<phi::TensorBase> &&impl) {
+  impl_ = std::move(impl);
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 gpuStream_t Tensor::stream() const {
   return platform::stream::get_current_stream(-1)->raw_stream();
@@ -337,5 +359,36 @@ void Tensor::set_autograd_meta(
   autograd_meta_ = std::move(autograd_meta);
 }
 
+void Tensor::bump_inplace_version() {
+  if (is_dense_tensor()) {
+    auto &inplace_version_counter =
+        std::dynamic_pointer_cast<phi::DenseTensor>(impl_)
+            ->InplaceVersionCounter();
+    VLOG(3) << "yoki: before bump inplace version: "
+            << inplace_version_counter.CurrentVersion();
+    inplace_version_counter.Bump();
+    VLOG(3) << "yoki: after bump inplace version: "
+            << inplace_version_counter.CurrentVersion();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "bump_inplace_version is only supported on DenseTensor now."));
+  }
+}
+
+uint32_t Tensor::current_inplace_version() {
+  if (is_dense_tensor()) {
+    auto &inplace_version_counter =
+        std::dynamic_pointer_cast<phi::DenseTensor>(impl_)
+            ->InplaceVersionCounter();
+    VLOG(3) << "yoki: print version: "
+            << inplace_version_counter.CurrentVersion();
+    return inplace_version_counter.CurrentVersion();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "current_inplace_version is only supported on DenseTensor now."));
+  }
+  return 0;
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc
index 885e29b27fa8e..c6214052f7bc3 100644
--- a/paddle/phi/api/lib/tensor_method.cc
+++ b/paddle/phi/api/lib/tensor_method.cc
@@ -19,19 +19,22 @@ limitations under the License. */
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/tensor_base.h"
 
+#include "paddle/phi/api/lib/api_gen_utils.h"
+#include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/infermeta/unary.h"
+
 namespace paddle {
 namespace experimental {
-
 // declare cast api
 Tensor cast(const Tensor &x, DataType out_dtype);
-Tensor copy_to(const Tensor &x, Backend backend, bool blocking);
+Tensor copy_to(const Tensor &x, Place place, bool blocking);
 
 Tensor Tensor::cast(DataType target_type) const {
   return experimental::cast(*this, target_type);
 }
 
-Tensor Tensor::copy_to(Backend backend, bool blocking) const {
-  return experimental::copy_to(*this, backend, blocking);
+Tensor Tensor::copy_to(Place place, bool blocking) const {
+  return experimental::copy_to(*this, place, blocking);
 }
 
 template <typename T>
@@ -41,7 +44,7 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
                   "`copy_to` method without template argument instead. "
                   "reason: copying a Tensor to another device does not need "
                   "to specify the data type template argument.";
-  return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false);
+  return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false);
 }
 
 template PADDLE_API Tensor
@@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to<phi::dtype::complex<double>>(
 template PADDLE_API Tensor
 Tensor::copy_to<phi::dtype::float16>(const PlaceType &target_place) const;
 
-void Tensor::copy_(const Tensor &src, bool blocking) {
+void Tensor::copy_(const Tensor &src,
+                   const phi::Place &target_place,
+                   bool blocking) {
   if (!src.is_initialized()) {
+    VLOG(8) << "Src is empty, skip copy";
     return;
   }
+  // Prepare copy kernel key and outputs
+  auto kernel_key_set = ParseKernelKeyByInputArgs(src);
+  KernelType kernel_type = ParseKernelTypeByInputArgs(src);
   VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name();
-  if (defined()) {
+  if (is_initialized()) {
     PADDLE_ENFORCE_EQ(dtype(),
                       src.dtype(),
                       platform::errors::PreconditionNotMet(
@@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) {
                           "Copy cannot be performed!",
                           name(),
                           src.name()));
+    PADDLE_ENFORCE_EQ(target_place,
+                      inner_place(),
+                      platform::errors::PreconditionNotMet(
+                          "Place is different of dst tensor and args %s, which "
+                          "current tensor holds %s "
+                          "Copy cannot be performed!",
+                          target_place.DebugString(),
+                          inner_place().DebugString()));
+    kernel_key_set.backend_set =
+        kernel_key_set.backend_set |
+        BackendSet(phi::TransToPhiBackend(inner_place()));
+  } else {
+    // Deep Copy AutoGrad info from src to self.
+    *autograd_meta_ = *(src.autograd_meta_);
+  }
+
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+  auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+
+  if (kernel_backend == Backend::UNDEFINED ||
+      kernel_layout == DataLayout::UNDEFINED ||
+      kernel_data_type == DataType::UNDEFINED) {
+    if (kernel_backend == Backend::UNDEFINED) {
+      kernel_backend = kernel_key.backend();
+    }
+    if (kernel_layout == DataLayout::UNDEFINED) {
+      kernel_layout = kernel_key.layout();
+    }
+    if (kernel_data_type == DataType::UNDEFINED) {
+      kernel_data_type = kernel_key.dtype();
+    }
+  }
+
+  if (kernel_type == KernelType::DENSE_TENSOR_KENREL) {
+    auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+        "copy", {kernel_backend, kernel_layout, kernel_data_type});
+    VLOG(6) << "copy API kernel key: " << kernel_key;
+    VLOG(6) << "copy API kernel: " << kernel;
+    using kernel_signature = void (*)(const platform::DeviceContext &,
+                                      const phi::DenseTensor &,
+                                      phi::Place,
+                                      bool,
+                                      phi::DenseTensor *);
+    SetKernelOutput(kernel_backend, this);
+    phi::MetaTensor meta_out(impl_.get());
+    phi::UnchangedInferMeta(
+        MakeMetaTensor(
+            *(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
+        &meta_out);
+    auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+    (*kernel_fn)(*dev_ctx,
+                 (*(std::static_pointer_cast<phi::DenseTensor>(src.impl_))),
+                 target_place,
+                 blocking,
+                 static_cast<phi::DenseTensor *>(impl_.get()));
+  } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) {
+    auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+        "copy_sr", {kernel_backend, kernel_layout, kernel_data_type});
+    VLOG(6) << "copy API kernel key: " << kernel_key;
+    VLOG(6) << "copy API kernel: " << kernel;
+    using kernel_signature = void (*)(const platform::DeviceContext &,
+                                      const phi::SelectedRows &,
+                                      phi::Place,
+                                      bool,
+                                      phi::SelectedRows *);
+    SetSelectedRowsKernelOutput(kernel_backend, this);
+    phi::MetaTensor meta_out(impl_.get());
+    phi::UnchangedInferMeta(
+        MakeMetaTensor(
+            *(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
+        &meta_out);
+    auto *kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+    (*kernel_fn)(*dev_ctx,
+                 (*(std::static_pointer_cast<phi::SelectedRows>(src.impl_))),
+                 target_place,
+                 blocking,
+                 static_cast<phi::SelectedRows *>(impl_.get()));
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "We currently only support dense tensor copy for now and if u need to "
+        "copy selected rows please raise a issue."));
   }
-  auto copy_tensor =
-      src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking);
-  set_impl(copy_tensor.impl());
 }
 
 }  // namespace experimental
diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt
index 6d056b54b7005..271a58222f0c0 100644
--- a/paddle/phi/api/lib/utils/CMakeLists.txt
+++ b/paddle/phi/api/lib/utils/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS
-tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits)
+tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar)
diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h
index bbd4966b7274f..6315fe15afdf1 100644
--- a/paddle/phi/backends/device_ext.h
+++ b/paddle/phi/backends/device_ext.h
@@ -523,6 +523,15 @@ struct CustomRuntimeParams {
   char reserved[32];
 };
 
+#define PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params)             \
+  if ((params)->size != sizeof(DevicePluginParams) &&           \
+      (params)->interface->size != sizeof(C_DeviceInterface)) { \
+    return;                                                     \
+  }                                                             \
+  (params)->version.major = PADDLE_DEVICE_PLUGIN_MAJOR_VERSION; \
+  (params)->version.minor = PADDLE_DEVICE_PLUGIN_MINOR_VERSION; \
+  (params)->version.patch = PADDLE_DEVICE_PLUGIN_PATCH_VERSION;
+
 // Plugin implement it and fill CustomRuntimeParams
 void InitPlugin(CustomRuntimeParams*);
 
diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc
index 1ffe38d8e1f4c..35339aed0f3e1 100644
--- a/paddle/phi/backends/device_manager.cc
+++ b/paddle/phi/backends/device_manager.cc
@@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() {
   return platform_manager;
 }
 
+void DeviceManager::Clear() {
+  Instance().device_map_.clear();
+  Instance().device_impl_map_.clear();
+}
+
 std::vector<std::string> ListAllLibraries(const std::string& library_dir) {
   std::vector<std::string> libraries;
   std::regex express(".*\\.so");
diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h
index c0911a0f8d50c..39eef27b4a607 100644
--- a/paddle/phi/backends/device_manager.h
+++ b/paddle/phi/backends/device_manager.h
@@ -158,6 +158,8 @@ class DeviceManager {
 
   static std::vector<size_t> GetDeviceList(const std::string& device_type);
 
+  static void Clear();
+
  private:
   DISABLE_COPY_AND_ASSIGN(DeviceManager);
   DeviceManager() {}
diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc
index a3b252598582b..0394835aa8b70 100644
--- a/paddle/phi/backends/gpu/gpu_context.cc
+++ b/paddle/phi/backends/gpu/gpu_context.cc
@@ -741,6 +741,10 @@ struct GPUContext::Impl {
 
 GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
 
+GPUContext::GPUContext(GPUContext&&) = default;
+
+GPUContext& GPUContext::operator=(GPUContext&&) = default;
+
 GPUContext::GPUContext(const GPUPlace& place)
     : DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
 
diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h
index 3eb4360ad3538..cd08da1c0f2f8 100644
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -77,6 +77,8 @@ class DnnWorkspaceHandle {
 class GPUContext : public DeviceContext {
  public:
   GPUContext();
+  GPUContext(GPUContext&&);
+  GPUContext& operator=(GPUContext&&);
 
   explicit GPUContext(const GPUPlace& place);
 
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index 85a1424ee34e0..9bf692703860f 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(phi_place SRCS place.cc)
+cc_library(scalar SRCS scalar.cc DEPS phi_enforce)
diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc
index 644bf3679af2a..2b5254d3d5f14 100644
--- a/paddle/phi/common/place.cc
+++ b/paddle/phi/common/place.cc
@@ -92,4 +92,20 @@ std::string GetGlobalDeviceType(size_t device_type_id) {
   return global_registered_device_type[device_type_id];
 }
 
+constexpr static int kAllocationTypeBitLength = 8;
+constexpr static int kDeviceTypeIDBitLength = 8;
+constexpr static int kDeviceIDBitLength = 8;
+
+uint32_t Place::Hash::operator()(const Place &place) const {
+  uint32_t hash_value = 0;
+  // |----31-24------|-----23-16------|-----15-08----|---7-0----|
+  // | For extension | AllocationType | DeviceTypeID | DeviceID |
+  hash_value |= (static_cast<uint8_t>(place.alloc_type_)
+                 << (kDeviceIDBitLength + kDeviceTypeIDBitLength));
+  hash_value |=
+      (static_cast<uint8_t>(place.device_type_id_) << kDeviceIDBitLength);
+  hash_value |= static_cast<uint8_t>(place.device);
+  return hash_value;
+}
+
 }  // namespace phi
diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h
index 36fb910cad6c7..4c6d47597bd2c 100644
--- a/paddle/phi/common/place.h
+++ b/paddle/phi/common/place.h
@@ -73,31 +73,23 @@ class Place {
 
   std::string DebugString() const;
 
+  struct Hash {
+    // Note: Now the number of bits we need does not exceed 32 bits, so there is
+    // no need to use 64 bits. If needed in the future, it can be expanded,
+    // but now we don’t over-design.
+    uint32_t operator()(const Place& place) const;
+  };
+
+  uint32_t HashValue() const { return Hash()(*this); }
+
   inline bool operator==(const Place& rhs) const {
-    if (alloc_type_ != rhs.GetType()) {
-      return false;
-    }
-    if (alloc_type_ == AllocationType::CPU ||
-        alloc_type_ == AllocationType::GPUPINNED ||
-        alloc_type_ == AllocationType::NPUPINNED) {
-      return true;
-    }
-    if (alloc_type_ == AllocationType::CUSTOM) {
-      return device_type_id_ == rhs.device_type_id_ &&
-             device == rhs.GetDeviceId();
-    }
-    return device == rhs.GetDeviceId();
+    return HashValue() == rhs.HashValue();
+  }
+  inline bool operator!=(const Place& rhs) const {
+    return HashValue() != rhs.HashValue();
   }
-  inline bool operator!=(const Place& rhs) const { return !(*this == rhs); }
   inline bool operator<(const Place& rhs) const {
-    if (alloc_type_ != rhs.GetType()) {
-      return static_cast<int>(alloc_type_) < static_cast<int>(rhs.GetType());
-    }
-    if (alloc_type_ == AllocationType::CUSTOM &&
-        device_type_id_ != rhs.device_type_id_) {
-      return device_type_id_ < rhs.device_type_id_;
-    }
-    return device < rhs.GetDeviceId();
+    return HashValue() < rhs.HashValue();
   }
 
  public:
@@ -206,3 +198,15 @@ class CustomPlace : public Place {
 std::ostream& operator<<(std::ostream&, const Place&);
 
 }  // namespace phi
+
+namespace paddle {
+namespace experimental {
+using AllocationType = phi::AllocationType;
+using Place = phi::Place;
+using CPUPlace = phi::CPUPlace;
+using GPUPlace = phi::GPUPlace;
+using GPUPinnedPlace = phi::GPUPinnedPlace;
+using XPUPlace = phi::XPUPlace;
+using NPUPlace = phi::NPUPlace;
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
new file mode 100644
index 0000000000000..5cd55c1e88bed
--- /dev/null
+++ b/paddle/phi/common/scalar.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/common/scalar.h"
+
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+// NOTE(xiongkun): why we put definition here?
+// test_custom_op can't include enforce.h, because enforce.h includes gflags.
+// so we decouple the include dependence of enforce.h by link.
+void ThrowTensorConvertError(int num) {
+  PADDLE_ENFORCE_EQ(num,
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The Scalar only supports Tensor with 1 element, but "
+                        "now Tensor has `%d` elements",
+                        num));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h
index 72cef89d300c8..5134f4eb72639 100644
--- a/paddle/phi/common/scalar.h
+++ b/paddle/phi/common/scalar.h
@@ -19,9 +19,12 @@ limitations under the License. */
 
 #include "paddle/phi/api/ext/exception.h"
 #include "paddle/phi/api/include/tensor.h"
+
 namespace paddle {
 namespace experimental {
 
+void ThrowTensorConvertError(int);
+
 template <typename T>
 class ScalarBase {
  public:
@@ -104,11 +107,7 @@ class ScalarBase {
   // The Tensor must have one dim
   ScalarBase(const T& tensor) : dtype_(tensor.dtype()) {  // NOLINT
     is_from_tensor_ = true;
-    PD_CHECK(
-        tensor.numel() == 1,
-        "The Scalar only supports Tensor with 1 element, but now Tensor has `",
-        tensor.numel(),
-        "` element.");
+    ThrowTensorConvertError(tensor.numel());
     switch (dtype_) {
       case DataType::FLOAT32:
         data_.f32 = tensor.template data<float>()[0];
@@ -156,6 +155,8 @@ class ScalarBase {
     CopyScalar(other, this);
   }
 
+  // NOTE(xiongkun): some op need to judge the dtype of the Scalar, we expose a
+  // interface.
   bool FromTensor() const { return is_from_tensor_; }
 
   void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; }
diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h
index 25b80279ecf10..71cec01141164 100644
--- a/paddle/phi/core/compat/arg_map_context.h
+++ b/paddle/phi/core/compat/arg_map_context.h
@@ -89,6 +89,8 @@ class ArgumentMappingContext {
 
   virtual bool IsDenseTensorInput(const std::string& name) const = 0;
   virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
+  // For compatibility with LoDTensorArray
+  virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
 
   virtual bool IsDenseTensorOutput(const std::string& name) const = 0;
   virtual bool IsSelectedRowsOutput(const std::string& name) const = 0;
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index 67245f1da5a6b..667cee10675d8 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -26,13 +26,14 @@ limitations under the License. */
 namespace phi {
 
 Backend TransToPhiBackend(const phi::Place& place) {
-  if (place.GetType() == phi::AllocationType::CPU) {
+  auto allocation_type = place.GetType();
+  if (allocation_type == phi::AllocationType::CPU) {
     return Backend::CPU;
-  } else if (place.GetType() == phi::AllocationType::GPU) {
+  } else if (allocation_type == phi::AllocationType::GPU) {
     return Backend::GPU;
-  } else if (place.GetType() == phi::AllocationType::XPU) {
+  } else if (allocation_type == phi::AllocationType::XPU) {
     return Backend::XPU;
-  } else if (place.GetType() == phi::AllocationType::CUSTOM) {
+  } else if (allocation_type == phi::AllocationType::CUSTOM) {
     return static_cast<Backend>(
         static_cast<size_t>(Backend::NUM_BACKENDS) +
         GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h
index 00e9bff9bd591..613a2f9960a6f 100644
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -42,12 +42,24 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
                                                            "flatten_grad",
                                                            "isinf",
                                                            "isnan",
+                                                           "unsqueeze",
+                                                           "unsqueeze_grad",
+                                                           "squeeze",
+                                                           "squeeze_grad",
                                                            "isfinite",
                                                            "matmul",
                                                            "matmul_grad",
                                                            "matmul_grad_grad",
                                                            "mean",
+                                                           "mean_grad",
                                                            "max",
+                                                           "max_grad",
+                                                           "min",
+                                                           "min_grad",
+                                                           "prod",
+                                                           "prod_grad",
+                                                           "any",
+                                                           "all",
                                                            "reshape",
                                                            "reshape_grad",
                                                            "expand",
@@ -55,6 +67,7 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
                                                            "expand_grad",
                                                            "expand_as_grad",
                                                            "sum",
+                                                           "one_hot",
                                                            "sum_grad",
                                                            "top_k",
                                                            "top_k_grad"});
diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc
index 7a0f50533360d..2e185fc0ca22b 100644
--- a/paddle/phi/core/dense_tensor.cc
+++ b/paddle/phi/core/dense_tensor.cc
@@ -110,8 +110,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator,
 template <typename T>
 const T* DenseTensor::data() const {
   check_memory_size();
-  PADDLE_ENFORCE(
-      (dtype() == paddle::experimental::CppTypeToDataType<T>::Type()),
+  PADDLE_ENFORCE_EQ(
+      dtype(),
+      paddle::experimental::CppTypeToDataType<T>::Type(),
       phi::errors::InvalidArgument(
           "The type of data we are trying to retrieve does not match the "
           "type of data currently contained in the container."));
diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc
index a32e0e44f4696..234e3528c363b 100644
--- a/paddle/phi/core/kernel_context.cc
+++ b/paddle/phi/core/kernel_context.cc
@@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs(
                  std::make_move_iterator(inputs.end()));
 }
 
+void KernelContext::EmplaceBackInputsWithoutSetRange(
+    paddle::SmallVector<const TensorBase*> inputs) {
+  inputs_.insert(inputs_.end(),
+                 std::make_move_iterator(inputs.begin()),
+                 std::make_move_iterator(inputs.end()));
+}
+
 void KernelContext::EmplaceBackOutput(TensorBase* output) {
   int index = outputs_.size();
   outputs_.emplace_back(output);
@@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs(
                   std::make_move_iterator(outputs.end()));
 }
 
+void KernelContext::EmplaceBackOutputsWithoutSetRange(
+    paddle::SmallVector<TensorBase*> outputs) {
+  outputs_.insert(outputs_.end(),
+                  std::make_move_iterator(outputs.begin()),
+                  std::make_move_iterator(outputs.end()));
+}
+
 void KernelContext::EmplaceBackAttr(paddle::any attr) {
   attrs_.emplace_back(std::move(attr));
 }
diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h
index 213ac47d30bfd..d3ca1ffc61c42 100644
--- a/paddle/phi/core/kernel_context.h
+++ b/paddle/phi/core/kernel_context.h
@@ -52,12 +52,18 @@ class KernelContext {
 
   void EmplaceBackInputs(paddle::SmallVector<const TensorBase*> inputs);
 
+  void EmplaceBackInputsWithoutSetRange(
+      paddle::SmallVector<const TensorBase*> inputs);
+
   void EmplaceBackOutput(TensorBase* output);
 
   void EmplaceBackOutputWithoutSetRange(TensorBase* output);
 
   void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
 
+  void EmplaceBackOutputsWithoutSetRange(
+      paddle::SmallVector<TensorBase*> outputs);
+
   void EmplaceBackAttr(paddle::any attr);
 
   const std::pair<int, int>& InputRangeAt(size_t idx) const;
diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h
index be91409762635..e502b9cb3e025 100644
--- a/paddle/phi/core/kernel_factory.h
+++ b/paddle/phi/core/kernel_factory.h
@@ -197,8 +197,16 @@ class Kernel {
 
   const KernelArgsDef& args_def() const { return args_def_; }
 
+  const TensorArgDef& InputAt(size_t idx) const {
+    return args_def_.input_defs().at(idx);
+  }
+
   TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); }
 
+  const TensorArgDef& OutputAt(size_t idx) const {
+    return args_def_.output_defs().at(idx);
+  }
+
   TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); }
 
   bool IsValid() { return fn_ != nullptr; }
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index d9ed68593cd61..c3356eadcbd21 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -98,6 +98,28 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                               default_tensor_layout,
                               default_key.dtype(),
                               arg_type);
+      } else if (arg_type == std::type_index(typeid(const SparseCooTensor&))) {
+        args_def->AppendInput(default_key.backend(),
+                              default_tensor_layout,
+                              default_key.dtype(),
+                              arg_type);
+      } else if (arg_type == std::type_index(typeid(
+                                 paddle::optional<const SparseCooTensor&>))) {
+        args_def->AppendInput(default_key.backend(),
+                              default_tensor_layout,
+                              default_key.dtype(),
+                              arg_type);
+      } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) {
+        args_def->AppendInput(default_key.backend(),
+                              default_tensor_layout,
+                              default_key.dtype(),
+                              arg_type);
+      } else if (arg_type == std::type_index(typeid(
+                                 paddle::optional<const SparseCsrTensor&>))) {
+        args_def->AppendInput(default_key.backend(),
+                              default_tensor_layout,
+                              default_key.dtype(),
+                              arg_type);
       } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
         args_def->AppendOutput(default_key.backend(),
                                default_tensor_layout,
@@ -114,6 +136,16 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
                                default_tensor_layout,
                                default_key.dtype(),
                                arg_type);
+      } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) {
+        args_def->AppendOutput(default_key.backend(),
+                               default_tensor_layout,
+                               default_key.dtype(),
+                               arg_type);
+      } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) {
+        args_def->AppendOutput(default_key.backend(),
+                               default_tensor_layout,
+                               default_key.dtype(),
+                               arg_type);
       } else {
         // Attribute deal with
         // TODO(chenweihang): now here allow any types of attribute, maybe
diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc
index 38a6e09a61ef8..bcbb1a4835b9d 100644
--- a/paddle/phi/core/meta_tensor.cc
+++ b/paddle/phi/core/meta_tensor.cc
@@ -72,6 +72,10 @@ void MetaTensor::set_layout(DataLayout layout) {
 }
 
 void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
+  if (meta_tensor.lod().size() == 0) {
+    // no need share
+    return;
+  }
   if (phi::DenseTensor::classof(tensor_)) {
     DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))->lod =
         meta_tensor.lod();
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index 37d1a234b5767..b680222f86350 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -64,6 +64,45 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
   }
 }
 
+void ConvTransposeGradInferMeta(const MetaTensor& x,
+                                const MetaTensor& filter,
+                                const MetaTensor& dout,
+                                const std::vector<int>& strides,
+                                const std::vector<int>& paddings,
+                                const std::vector<int>& output_padding,
+                                const std::vector<int>& output_size,
+                                const std::string& padding_algorithm,
+                                int groups,
+                                const std::vector<int>& dilations,
+                                const std::string& data_format,
+                                MetaTensor* dx,
+                                MetaTensor* dfilter) {
+  GeneralBinaryGradInferMeta(x, filter, dx, dfilter);
+}
+
+void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
+                                        const MetaTensor& filter,
+                                        const MetaTensor& dout,
+                                        const MetaTensor& ddx,
+                                        const MetaTensor& ddfilter,
+                                        const std::vector<int>& strides,
+                                        const std::vector<int>& paddings,
+                                        const std::vector<int>& output_padding,
+                                        const std::vector<int>& output_size,
+                                        const std::string& padding_algorithm,
+                                        int groups,
+                                        const std::vector<int>& dilations,
+                                        const std::string& data_format,
+                                        MetaTensor* dx,
+                                        MetaTensor* dfilter,
+                                        MetaTensor* ddout) {
+  GeneralBinaryGradInferMeta(x, filter, dx, dfilter);
+
+  if (ddout) {
+    ddout->share_meta(dout);
+  }
+}
+
 void GatherNdGradInferMeta(const MetaTensor& x,
                            const MetaTensor& index,
                            const MetaTensor& out_grad,
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index 06ee5a205d7b0..5c49a58a715a4 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -17,10 +17,17 @@ limitations under the License. */
 #include <tuple>
 
 #include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/infermeta/binary.h"
+#include "paddle/phi/infermeta/multiary.h"
+#include "paddle/phi/infermeta/ternary.h"
 #include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
 
+// Common InferMeta Functions for backward operators.
+//
+// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
+
 void BilinearTensorProductGradInferMeta(const MetaTensor& x,
                                         const MetaTensor& y,
                                         const MetaTensor& weight,
@@ -30,6 +37,37 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x,
                                         MetaTensor* dweight,
                                         MetaTensor* dbias);
 
+void ConvTransposeGradInferMeta(const MetaTensor& x,
+                                const MetaTensor& filter,
+                                const MetaTensor& dout,
+                                const std::vector<int>& strides,
+                                const std::vector<int>& paddings,
+                                const std::vector<int>& output_padding,
+                                const std::vector<int>& output_size,
+                                const std::string& padding_algorithm,
+                                int groups,
+                                const std::vector<int>& dilations,
+                                const std::string& data_format,
+                                MetaTensor* dx,
+                                MetaTensor* dfilter);
+
+void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
+                                        const MetaTensor& filter,
+                                        const MetaTensor& dout,
+                                        const MetaTensor& ddx,
+                                        const MetaTensor& ddfilter,
+                                        const std::vector<int>& strides,
+                                        const std::vector<int>& paddings,
+                                        const std::vector<int>& output_padding,
+                                        const std::vector<int>& output_size,
+                                        const std::string& padding_algorithm,
+                                        int groups,
+                                        const std::vector<int>& dilations,
+                                        const std::string& data_format,
+                                        MetaTensor* dx,
+                                        MetaTensor* dfilter,
+                                        MetaTensor* ddout);
+
 void GatherNdGradInferMeta(const MetaTensor& x,
                            const MetaTensor& index,
                            const MetaTensor& out_grad,
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 2947661517e78..5221076f10daa 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -17,7 +17,10 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 
 namespace phi {
@@ -72,6 +75,51 @@ void AllValueCompareInferMeta(const MetaTensor& x,
   out->set_dtype(DataType::BOOL);
 }
 
+void KLDivInferMeta(const MetaTensor& x,
+                    const MetaTensor& label,
+                    const std::string& reduction,
+                    MetaTensor* out,
+                    MetaConfig config) {
+  auto dim_x = x.dims();
+  auto dim_target = label.dims();
+  PADDLE_ENFORCE_EQ(dim_x.size(),
+                    dim_target.size(),
+                    phi::errors::InvalidArgument(
+                        "Input(X) rank and Input(Target) rank should be "
+                        "same, but received X rank(%d) != Target rank(%d)",
+                        dim_x.size(),
+                        dim_target.size()));
+  for (int i = 0; i < dim_x.size(); i++) {
+    if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          dim_x[i],
+          dim_target[i],
+          phi::errors::InvalidArgument(
+              "Input(X) and Input(Target) should in same shape. but received "
+              "X dimension[%d](%d) != Target dimension[%d](%d)",
+              i,
+              dim_x[i],
+              i,
+              dim_target[i]));
+    }
+  }
+
+  auto reduction_valid = "mean" == reduction || "sum" == reduction ||
+                         "batchmean" == reduction || "none" == reduction;
+  PADDLE_ENFORCE_EQ(
+      reduction_valid,
+      true,
+      phi::errors::InvalidArgument(
+          "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'."));
+
+  if ("none" == reduction) {
+    out->set_dims(dim_x);
+  } else {
+    out->set_dims({1});
+  }
+  out->set_dtype(x.dtype());
+}
+
 void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
   out->share_meta(x);
 }
@@ -264,6 +312,376 @@ void CompareAllInferMeta(const MetaTensor& x,
   out->set_dtype(DataType::BOOL);
 }
 
+void ConvInferMeta(const MetaTensor& input,
+                   const MetaTensor& filter,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings_t,
+                   const std::string& padding_algorithm,
+                   int groups,
+                   const std::vector<int>& dilations_t,
+                   const std::string& data_format,
+                   bool use_addto,
+                   int workspace_size_MB,
+                   bool exhaustive_search,
+                   MetaTensor* out,
+                   MetaConfig config) {
+  std::vector<int> paddings = paddings_t;
+  std::vector<int> dilations = dilations_t;
+  auto in_dims = input.dims();
+  auto filter_dims = filter.dims();
+  int dilation_size = dilations.size();
+  for (int i = 0; i < dilation_size; ++i) {
+    PADDLE_ENFORCE_GT(
+        dilations[i],
+        0,
+        phi::errors::InvalidArgument(
+            "The dilation of Op(Conv) should be larget than 0, but received "
+            "dilation is %d.",
+            dilations[i]));
+  }
+  const bool channel_last = (config.is_run_mkldnn_kernel == false) &&
+                            (data_format == "NHWC" || data_format == "NDHWC");
+
+  PADDLE_ENFORCE_EQ(
+      in_dims.size() == 4 || in_dims.size() == 5,
+      true,
+      phi::errors::InvalidArgument(
+          "The input of Op(Conv) should be a 4-D or 5-D Tensor. But "
+          "received: input's dimension is %u, input's shape is [%s].",
+          in_dims.size(),
+          in_dims));
+
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(),
+      filter_dims.size(),
+      phi::errors::InvalidArgument(
+          "The input's dimension and filter's dimension of "
+          "Op(Conv) should be equal. But received: the input's shape is [%s], "
+          "the input's dimension is %d; the filter's shape is [%s],  "
+          "the filter's dimension is %d.",
+          in_dims,
+          in_dims.size(),
+          filter_dims,
+          filter_dims.size()));
+
+  int stride_size = strides.size();
+  for (int i = 0; i < stride_size; ++i) {
+    PADDLE_ENFORCE_GT(
+        strides[i],
+        0,
+        phi::errors::InvalidArgument(
+            "The stride of Op(Conv) should be larget than 0, but received "
+            "stride is %d.",
+            strides[i]));
+  }
+
+  int in_sub_stride_size = in_dims.size() - stride_size;
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(),
+      strides.size() + 2U,
+      phi::errors::InvalidArgument(
+          "The difference of input's dimension and Attr(strides)'s "
+          "length must be euqal to 2 for Op(Conv). "
+          "But received: input's dimension is %d, input's shape is [%s]; "
+          "Attr(stride)'s length is %d, Attr(stride) is [%s]; "
+          "difference of input's dimention and Attr(strides)'s length = %u.",
+          in_dims.size(),
+          in_dims,
+          strides.size(),
+          phi::make_ddim(strides),
+          in_sub_stride_size));
+
+  const auto input_channels =
+      channel_last ? in_dims[in_dims.size() - 1] : in_dims[1];
+
+  PADDLE_ENFORCE_EQ(
+      input_channels,
+      filter_dims[1] * groups,
+      phi::errors::InvalidArgument(
+          "The number of input's channels should be equal to filter's channels "
+          "* groups for Op(Conv). But received: the input's channels is %d, "
+          "the input's shape is [%s]; the filter's channels is %d, the "
+          "filter's shape is [%s]; the groups is %d, the data_format is %s. "
+          "The error may come from wrong data_format setting.",
+          input_channels,
+          in_dims,
+          filter_dims[1],
+          filter_dims,
+          groups,
+          data_format));
+  PADDLE_ENFORCE_EQ(
+      filter_dims[0] % groups,
+      0,
+      phi::errors::InvalidArgument(
+          "The number of output's channels (filter's first dimension) of "
+          "Op(Conv) should be divided by groups. But received: "
+          "the output channels is %d, the filter's shape is [%s], "
+          "the groups is %d.",
+          filter_dims[0],
+          filter_dims,
+          groups));
+
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_GT(
+        filter_dims[0],
+        0,
+        phi::errors::InvalidArgument(
+            "the size of filter at axis 0 should be greater than 0"));
+  }
+
+  DDim in_data_dims;
+  if (channel_last) {
+    in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+  } else {
+    in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
+  }
+
+  DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
+
+  std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+  phi::UpdatePaddingAndDilation(
+      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
+
+  std::vector<int64_t> output_shape({in_dims[0]});
+  if (!channel_last) {
+    output_shape.push_back(filter_dims[0]);
+  }
+  for (int i = 0; i < in_data_dims.size(); ++i) {
+    if ((!config.is_runtime) &&
+        (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) {
+      output_shape.push_back(-1);
+    } else {
+      const int dkernel = dilations[i] * (filter_data_dims[i] - 1) + 1;
+      int output_size =
+          (in_data_dims[i] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
+              strides[i] +
+          1;
+      output_shape.push_back(output_size);
+    }
+  }
+  if (channel_last) {
+    output_shape.push_back(filter_dims[0]);
+  }
+
+  out->set_dims(make_ddim(output_shape));
+  out->set_dtype(input.dtype());
+}
+
+void ConvInferInferMeta(const MetaTensor& input,
+                        const MetaTensor& filter,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::string& paddding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations,
+                        const std::string& data_format,
+                        MetaTensor* out,
+                        MetaConfig config) {
+  ConvInferMeta(input,
+                filter,
+                strides,
+                paddings,
+                paddding_algorithm,
+                groups,
+                dilations,
+                data_format,
+                /*use_addto=*/false,
+                /*workspace_size_MB=*/512,  // useless in infermeta
+                /*exhaustive_search=*/false,
+                out,
+                config);
+}
+
+void ConvTransposeInferMeta(const MetaTensor& x,
+                            const MetaTensor& filter,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            const std::vector<int>& output_padding,
+                            const std::vector<int>& output_size,
+                            const std::string& padding_algorithm,
+                            int groups,
+                            const std::vector<int>& dilations,
+                            const std::string& data_format,
+                            MetaTensor* out,
+                            MetaConfig config) {
+  auto x_dims = x.dims();
+  auto filter_dims = filter.dims();
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  const DataLayout data_layout =
+      config.is_run_mkldnn_kernel
+          ? DataLayout::kNCHW
+          : paddle::framework::StringToDataLayout(data_format);
+
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() == 4 || x_dims.size() == 5,
+      true,
+      errors::InvalidArgument("Input of Op(conv_transpose) should be 4-D or "
+                              "5-D Tensor. But received: %u-D Tensor, "
+                              "the shape of input is [%s]",
+                              x_dims.size(),
+                              x_dims));
+  PADDLE_ENFORCE_EQ(
+      x_dims.size(),
+      filter_dims.size(),
+      errors::InvalidArgument(
+          "The input's dimension size and filter's dimension size of "
+          "Op (conv_transpose) should be equal. But received: the shape of "
+          "input is [%s], the dimension size of input is [%d], the shape "
+          "of filter is [%s],  the dimension size of filter is [%d]. ",
+          x_dims,
+          x_dims.size(),
+          filter_dims,
+          filter_dims.size()));
+
+  int stride_size = strides.size();
+  for (int i = 0; i < stride_size; ++i) {
+    PADDLE_ENFORCE_GT(
+        strides[i],
+        0,
+        errors::InvalidArgument(
+            "The stride of Op(Conv) should be larget than 0, but received "
+            "stride is %d.",
+            strides[i]));
+  }
+
+  int in_sub_stride_size = x_dims.size() - stride_size;
+
+  PADDLE_ENFORCE_EQ(
+      x_dims.size() - strides.size(),
+      2U,
+      errors::InvalidArgument(
+          "The input's dimension size minus Attr(stride)'s size must "
+          "be euqal to 2 for Op(conv_transpose). But received: [%d], the "
+          "input's dimension size is [%d], the shape of input "
+          "is [%s], the Attr(stride)'s size is [%d].",
+          in_sub_stride_size,
+          x_dims.size(),
+          x_dims,
+          strides.size()));
+  if (output_size.size())
+    PADDLE_ENFORCE_EQ(
+        output_size.size(),
+        strides.size(),
+        errors::InvalidArgument(
+            "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
+            "should be the same."));
+  if (output_padding.size())
+    PADDLE_ENFORCE_EQ(
+        output_padding.size(),
+        strides.size(),
+        errors::InvalidArgument(
+            "The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
+            "should be the same."));
+
+  const int64_t C =
+      (data_layout != DataLayout::kNHWC ? x_dims[1]
+                                        : x_dims[x_dims.size() - 1]);
+  PADDLE_ENFORCE_EQ(
+      C,
+      filter_dims[0],
+      errors::InvalidArgument(
+          "The number of input channels should be equal to filter channels "
+          "for Op(conv_transpose). But received: the input's channels is "
+          "[%d], the shape of input is [%s], the filter's channels is [%d], "
+          "the shape of filter is [%s]. The data_format is %s."
+          "The error may come from wrong data_format setting.",
+          C,
+          x_dims,
+          filter_dims[0],
+          filter_dims,
+          data_format));
+
+  DDim x_data_dims;
+  if (data_layout != DataLayout::kNHWC) {
+    x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  } else {
+    x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
+
+  std::vector<int64_t> output_shape({x_dims[0]});
+  if (data_layout != DataLayout::kNHWC) {
+    output_shape.push_back(filter_dims[1] * groups);
+  }
+  const int offset = (data_layout != DataLayout::kNHWC ? 2 : 1);
+  for (size_t i = 0; i < strides.size(); ++i) {
+    auto filter_extent = dilations_[i] * (filter_dims[i + 2] - 1) + 1;
+    auto infer_shape = (config.is_runtime || x_dims[i + offset] > 0)
+                           ? (x_dims[i + offset] - 1) * strides[i] -
+                                 paddings_[2 * i] - paddings_[2 * i + 1] +
+                                 filter_extent
+                           : -1;
+    if (output_size.size()) {
+      if (config.is_runtime) {
+        PADDLE_ENFORCE_GE(
+            output_size[i],
+            infer_shape,
+            errors::InvalidArgument(
+                "output_size of Op(ConvTransposeOp) should not be "
+                "less than the infered output size. But received output_size = "
+                "[%s], whose dim %d is less than the infered output size [%s]",
+                make_ddim(output_size).to_str(),
+                i,
+                infer_shape));
+        PADDLE_ENFORCE_LT(
+            output_size[i],
+            infer_shape + strides[i],
+            errors::InvalidArgument(
+                "output_size of Op(ConvTransposeOp) should be less "
+                "than infered size + stride. But received output_size = [%s], "
+                "whose dim %d is not less than the infered output size (%d) + "
+                "stride (%d) = %d",
+                make_ddim(output_size).to_str(),
+                i,
+                infer_shape,
+                strides[i],
+                infer_shape + strides[i]));
+      }
+      output_shape.push_back(output_size[i]);
+    } else if (output_padding.size()) {
+      if (config.is_runtime) {
+        PADDLE_ENFORCE_GE(
+            output_padding[i],
+            0,
+            errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should not be "
+                "less than the 0. But received output_padding = "
+                "[%s], whose dim %d is less than 0",
+                make_ddim(output_padding).to_str(),
+                i));
+        PADDLE_ENFORCE_LT(
+            output_padding[i],
+            std::max(strides[i], dilations_[i]),
+            errors::InvalidArgument(
+                "output_padding of Op(ConvTransposeOp) should be less "
+                "than either stride or dilation. But received output_size = "
+                "[%s], "
+                "whose dim %d is not less than either stride (%d)  or "
+                "dilation (%d)",
+                make_ddim(output_size).to_str(),
+                i,
+                strides[i],
+                dilations_[i]));
+      }
+      output_shape.push_back((infer_shape + output_padding[i]));
+    } else {
+      output_shape.push_back(infer_shape);
+    }
+  }
+  if (data_layout == DataLayout::kNHWC) {
+    output_shape.push_back(filter_dims[1] * groups);
+  }
+
+  out->set_dims(make_ddim(output_shape));
+  out->set_dtype(x.dtype());
+}
+
 void CrossInferMeta(const MetaTensor& x,
                     const MetaTensor& y,
                     int axis,
@@ -430,6 +848,82 @@ void ElementwiseRawInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }
 
+void ExpandAsInferMeta(const MetaTensor& x,
+                       paddle::optional<const MetaTensor&> y,
+                       const std::vector<int>& target_shape,
+                       MetaTensor* out) {
+#define MAX_RANK_SUPPORTED 6
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      target_shape.size(),
+      static_cast<size_t>(x_dims.size()),
+      phi::errors::InvalidArgument(
+          "The rank of target_shape must be greater than or equal "
+          "to the rank of Input(X). But received Input(X): input "
+          "rank %u; received target_shape: rank %u.",
+          x_dims.size(),
+          target_shape.size()));
+  PADDLE_ENFORCE_LE(target_shape.size(),
+                    MAX_RANK_SUPPORTED,
+                    phi::errors::InvalidArgument(
+                        "The rank of target_shape must be less than or equal "
+                        "to %d. But received: rank %u.",
+                        MAX_RANK_SUPPORTED,
+                        target_shape.size()));
+  out->set_dims(phi::make_ddim(target_shape));
+  out->set_dtype(x.dtype());
+#undef MAX_RANK_SUPPORTED
+}
+
+void GatherInferMeta(const MetaTensor& x,
+                     const MetaTensor& index,
+                     const Scalar& axis,
+                     MetaTensor* out) {
+  auto index_dims = index.dims();
+
+  if (index_dims.size() == 2) {
+    PADDLE_ENFORCE_EQ(
+        index_dims[1],
+        1,
+        phi::errors::InvalidArgument(
+            "The last dim of index should be 1 when it is 2D, but we get %d",
+            index_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        index_dims.size(),
+        1,
+        phi::errors::InvalidArgument(
+            "The index should be 1D, when it is not 2D, but we get %d",
+            index_dims.size()));
+  }
+
+  auto input_dim = x.dims();
+  auto axis_v = axis.to<int>();
+  if (axis.FromTensor() || axis_v == 0) {
+    // if axis.FromTensor(), we can not obtain correct shape of output
+    int batch_size = index_dims[0];
+    phi::DDim output_dims(input_dim);
+    output_dims[0] = batch_size;
+    out->set_dims(output_dims);
+    out->set_dtype(x.dtype());
+    out->share_lod(x);
+  } else {
+    int index_size = index_dims[0];
+    std::vector<int> out_dim_vec;
+    for (int i = 0; i < axis_v; i++) {
+      out_dim_vec.push_back(input_dim[i]);
+    }
+    out_dim_vec.push_back(index_size);
+    for (int i = axis_v + 1; i < input_dim.size(); i++) {
+      out_dim_vec.push_back(input_dim[i]);
+    }
+    auto output_dims = phi::make_ddim(out_dim_vec);
+    out->set_dims(output_dims);
+    out->set_dtype(x.dtype());
+    out->share_lod(x);
+  }
+}
+
 void GatherNdInferMeta(const MetaTensor& x,
                        const MetaTensor& index,
                        MetaTensor* out) {
@@ -476,6 +970,48 @@ void GatherTreeMeta(const MetaTensor& ids,
   out->set_dims(ids_dims);
 }
 
+void GridSampleBaseInferMeta(const MetaTensor& x,
+                             const MetaTensor& grid,
+                             MetaTensor* out,
+                             MetaConfig config) {
+  auto x_dims = x.dims();
+  auto grid_dims = grid.dims();
+  PADDLE_ENFORCE_EQ(x_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "Input(X) of GridSampleOp should be 4-D Tensor, but "
+                        "received X dimension size(%d)",
+                        x_dims.size()));
+  PADDLE_ENFORCE_EQ(grid_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "Input(Grid) of GridSampleOp should be 4-D Tensor, "
+                        "but received X dimension size(%d)",
+                        grid_dims.size()));
+  if (config.is_runtime || grid_dims[3] > 0) {
+    PADDLE_ENFORCE_EQ(
+        grid_dims[3],
+        2,
+        phi::errors::InvalidArgument(
+            "Input(Grid) dimension[3] should be 2, but received %d",
+            grid_dims[3]));
+  }
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        grid_dims[0],
+        x_dims[0],
+        phi::errors::InvalidArgument(
+            "Input(X) and Input(Grid) dimension[0] should be equal, but "
+            "received X dimension[0](%d) != Grid dimension[0](%d)",
+            x_dims[0],
+            grid_dims[0]));
+  }
+
+  out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]});
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
 void HuberLossInferMeta(const MetaTensor& input,
                         const MetaTensor& label,
                         float delta,
@@ -548,6 +1084,67 @@ void IndexSampleInferMeta(const MetaTensor& x,
   out->share_lod(y);
 }
 
+void IndexSelectInferMeta(const MetaTensor& x,
+                          const MetaTensor& index,
+                          int dim,
+                          MetaTensor* output) {
+  auto input_dim = x.dims();
+  auto index_dim = index.dims();
+
+  PADDLE_ENFORCE_EQ(
+      dim < input_dim.size() && dim >= (0 - input_dim.size()),
+      true,
+      phi::errors::OutOfRange(
+          "Attr(dim) is out of range, It's expected "
+          "to be in range of [-%d, %d]. But received Attr(dim) = %d.",
+          input_dim.size(),
+          input_dim.size() - 1,
+          dim));
+
+  PADDLE_ENFORCE_EQ(
+      index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1),
+      true,
+      phi::errors::InvalidArgument(
+          "The 'shape' of Input(Index) must be 1-D tensor. "
+          "But received: the 'shape' of Input(Index) is [%s], "
+          "the dimension of Input(Index) is [%d].",
+          index_dim,
+          index_dim.size()));
+
+  PADDLE_ENFORCE_EQ(
+      index_dim[0] != 0,
+      true,
+      phi::errors::InvalidArgument("The length of Input(Index) can't be 0."));
+
+  auto output_dim = phi::vectorize(input_dim);
+  if (dim < 0) {
+    dim += input_dim.size();
+  }
+  output_dim[dim] = index_dim[0];
+  output->set_dims(phi::make_ddim(output_dim));
+  output->set_dtype(x.dtype());
+  output->set_layout(x.layout());
+  output->share_lod(x);
+}
+
+void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) {
+  auto dim_x = x.dims();
+  auto dim_y = y.dims();
+  auto rank_x = dim_x.size();
+  auto rank_y = dim_y.size();
+  auto rank = (rank_x > rank_y) ? rank_x : rank_y;
+
+  std::vector<int64_t> dim_out;
+  dim_out.reserve(rank);
+  for (int i = 0; i < rank; i++) {
+    int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x));
+    int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y));
+    dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi);
+  }
+  out->set_dims(phi::make_ddim(dim_out));
+  out->set_dtype(x.dtype());
+}
+
 void LogLossInferMeta(const MetaTensor& input,
                       const MetaTensor& label,
                       float epsilon,
@@ -590,6 +1187,13 @@ void LogLossInferMeta(const MetaTensor& input,
   out->share_lod(input);
 }
 
+void MaskedSelectInferMeta(const MetaTensor& x,
+                           const MetaTensor& mask,
+                           MetaTensor* out) {
+  out->set_dims({-1});  // can not infer
+  out->set_dtype(x.dtype());
+}
+
 void MatmulInferMeta(const MetaTensor& x,
                      const MetaTensor& y,
                      bool trans_x,
@@ -693,6 +1297,157 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) {
   out->share_lod(x);
 }
 
+void PReluInferMeta(const MetaTensor& x,
+                    const MetaTensor& alpha,
+                    const std::string& mode,
+                    const std::string& data_format,
+                    MetaTensor* out,
+                    MetaConfig config) {
+  auto x_dim = x.dims();
+  if (mode == "all") {
+    PADDLE_ENFORCE_EQ(phi::product(alpha.dims()),
+                      1,
+                      phi::errors::InvalidArgument(
+                          "For mode 'all', size of weight Alpha must be one. "
+                          "But recevied alpha's size: %d.",
+                          product(alpha.dims())));
+  } else if (mode == "channel") {
+    auto x_rank = x_dim.size();
+    PADDLE_ENFORCE_GE(x_rank,
+                      2,
+                      phi::errors::InvalidArgument(
+                          "For mode 'channel', rank of input X must be "
+                          "equal or larger than 2. But recevied X's "
+                          "rank: %d",
+                          x_rank));
+    PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC",
+                      true,
+                      phi::errors::InvalidArgument(
+                          "For mode 'channel', data_format must be one of "
+                          "NCHW and NHWC. But recevied data_format: %s",
+                          data_format));
+    if (data_format == "NCHW" || config.is_run_mkldnn_kernel) {
+      PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1],
+                        true,
+                        phi::errors::InvalidArgument(
+                            "For mode 'channel', size of weight Alpha must be "
+                            "equal to the number of channels of input(x). But "
+                            "recevied alpha's size: %d, x_dim[1]: %d",
+                            product(alpha.dims()),
+                            x_dim[1]));
+    } else {
+      PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1],
+                        true,
+                        phi::errors::InvalidArgument(
+                            "For mode 'channel', size of weight Alpha must be "
+                            "equal to the number of channels of input(x). But "
+                            "recevied alpha's size: %d, x_dim[%d]: %d",
+                            product(alpha.dims()),
+                            x_rank - 1,
+                            x_dim[x_rank - 1]));
+    }
+  } else if (mode == "element") {
+    auto alpha_dim = alpha.dims();
+    auto alpha_rank = alpha_dim.size();
+    auto x_rank = x_dim.size();
+    PADDLE_ENFORCE_GE(x_rank,
+                      1,
+                      phi::errors::InvalidArgument(
+                          "For mode 'element', rank of input X must be "
+                          "equal or larger than 2. But recevied X's "
+                          "rank: %d",
+                          x_rank));
+    PADDLE_ENFORCE_EQ(
+        alpha_rank,
+        x_rank,
+        phi::errors::InvalidArgument(
+            "For mode 'element', rank of weight Alpha must be ",
+            "equal to the rank of input(x). But recevied alpha's rank: %d, "
+            "x's rank: %d.",
+            alpha_rank,
+            x_rank));
+    size_t x_product = 1;
+    size_t alpha_product = 1;
+    for (int64_t i = x_rank - 1; i > 0; i--) {
+      x_product *= x_dim[i];
+      alpha_product *= alpha_dim[i];
+    }
+    PADDLE_ENFORCE_EQ(
+        alpha_product,
+        x_product,
+        phi::errors::InvalidArgument(
+            "For mode 'element', the size of weight Alpha must be "
+            "equal to the size of input(x). But recevied alpha's size: %d, "
+            "x's size: %d.",
+            alpha_product,
+            x_product));
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. "
+        "But recevied "
+        "mode: '%s'.",
+        mode));
+  }
+  out->set_dims(x_dim);
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+  out->share_lod(x);
+}
+
+void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
+                           const MetaTensor& value,
+                           bool out_int32,
+                           bool right,
+                           MetaTensor* out) {
+  auto sequences_dims = sorted_sequence.dims();
+  auto values_dims = value.dims();
+
+  bool flag = true;
+  if (sequences_dims.size() != values_dims.size()) {
+    flag = false;
+  }
+  const auto& sequences_dims_size = sequences_dims.size();
+  for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) {
+    if (sequences_dims[dim] != values_dims[dim]) {
+      flag = false;
+      break;
+    }
+  }
+  if (sequences_dims.size() != 1) {
+    PADDLE_ENFORCE_EQ(
+        flag,
+        true,
+        phi::errors::Unavailable(
+            "The dimensions of sorted_sequence tensor ( %s ) and values "
+            "tensor ( %s ) can not match. Because the input sorted_sequence "
+            "tensor must be 1 dimension or the first N-1 dimensions of "
+            "sorted_sequence tensor and input values tensor must match. "
+            "Please input appropriate sorted_sequence and values again! ",
+            sequences_dims,
+            values_dims));
+  }
+
+  if (out_int32) {
+    PADDLE_ENFORCE_LT(
+        sequences_dims[sequences_dims.size() - 1],
+        std::numeric_limits<int>::max(),
+        phi::errors::Unavailable(
+            "The size of sorted_sequence %d exceed the maximum limit d%. "
+            "Because the size of sorted_sequence should be less than the "
+            "output maximum value for int32 bit. Please set appropriate "
+            "sorted_sequence to meet this requirement! ",
+            sequences_dims[sequences_dims.size() - 1],
+            std::numeric_limits<int>::max()));
+  }
+
+  out->set_dims(values_dims);
+  if (out_int32) {
+    out->set_dtype(DataType::INT32);
+  } else {
+    out->set_dtype(DataType::INT64);
+  }
+}
+
 void SegmentPoolInferMeta(const MetaTensor& x,
                           const MetaTensor& segment_ids,
                           const std::string& pooltype,
@@ -812,4 +1567,130 @@ void TriangularSolveInferMeta(const MetaTensor& x,
   out->share_lod(y);
 }
 
+void YoloBoxInferMeta(const MetaTensor& x,
+                      const MetaTensor& img_size,
+                      const std::vector<int>& anchors,
+                      int class_num,
+                      float conf_thresh,
+                      int downsample_ratio,
+                      bool clip_bbox,
+                      float scale_x_y,
+                      bool iou_aware,
+                      float iou_aware_factor,
+                      MetaTensor* boxes,
+                      MetaTensor* scores,
+                      MetaConfig config) {
+  auto dim_x = x.dims();
+  auto dim_imgsize = img_size.dims();
+  int anchor_num = anchors.size() / 2;
+
+  PADDLE_ENFORCE_EQ(
+      dim_x.size(),
+      4,
+      phi::errors::InvalidArgument("Input(X) should be a 4-D tensor."
+                                   "But received X dimension(%s)",
+                                   dim_x.size()));
+  if (iou_aware) {
+    PADDLE_ENFORCE_EQ(
+        dim_x[1],
+        anchor_num * (6 + class_num),
+        phi::errors::InvalidArgument(
+            "Input(X) dim[1] should be equal to (anchor_mask_number * (6 "
+            "+ class_num)) while iou_aware is true."
+            "But received dim[1](%s) != (anchor_mask_number * "
+            "(6+class_num)(%s).",
+            dim_x[1],
+            anchor_num * (6 + class_num)));
+    PADDLE_ENFORCE_GE(
+        iou_aware_factor,
+        0,
+        phi::errors::InvalidArgument(
+            "Attr(iou_aware_factor) should greater than or equal to 0."
+            "But received iou_aware_factor (%s)",
+            iou_aware_factor));
+    PADDLE_ENFORCE_LE(
+        iou_aware_factor,
+        1,
+        phi::errors::InvalidArgument(
+            "Attr(iou_aware_factor) should less than or equal to 1."
+            "But received iou_aware_factor (%s)",
+            iou_aware_factor));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        dim_x[1],
+        anchor_num * (5 + class_num),
+        phi::errors::InvalidArgument(
+            "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+            "+ class_num))."
+            "But received dim[1](%s) != (anchor_mask_number * "
+            "(5+class_num)(%s).",
+            dim_x[1],
+            anchor_num * (5 + class_num)));
+  }
+  PADDLE_ENFORCE_EQ(
+      dim_imgsize.size(),
+      2,
+      phi::errors::InvalidArgument("Input(ImgSize) should be a 2-D tensor."
+                                   "But received Imgsize size(%s)",
+                                   dim_imgsize.size()));
+  if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        dim_imgsize[0],
+        dim_x[0],
+        phi::errors::InvalidArgument(
+            "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."));
+  }
+  PADDLE_ENFORCE_EQ(
+      dim_imgsize[1],
+      2,
+      phi::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2."
+                                   "But received imgsize dim[1](%s).",
+                                   dim_imgsize[1]));
+  PADDLE_ENFORCE_GT(anchors.size(),
+                    0,
+                    phi::errors::InvalidArgument(
+                        "Attr(anchors) length should be greater than 0."
+                        "But received anchors length(%s).",
+                        anchors.size()));
+  PADDLE_ENFORCE_EQ(anchors.size() % 2,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "Attr(anchors) length should be even integer."
+                        "But received anchors length (%s)",
+                        anchors.size()));
+  PADDLE_ENFORCE_GT(class_num,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "Attr(class_num) should be an integer greater than 0."
+                        "But received class_num (%s)",
+                        class_num));
+
+  int box_num;
+  if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) {
+    box_num = dim_x[2] * dim_x[3] * anchor_num;
+  } else {
+    box_num = -1;
+  }
+  std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
+  boxes->set_dims(phi::make_ddim(dim_boxes));
+  boxes->set_dtype(x.dtype());
+
+  std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
+  scores->set_dims(phi::make_ddim(dim_scores));
+}
+
+void ValueCompareInferMeta(const MetaTensor& x,
+                           const MetaTensor& y,
+                           MetaTensor* out,
+                           MetaConfig config) {
+  detail::BinarySameInputDimsCheck(x, y, config);
+
+  out->set_dims(x.dims());
+  out->set_dtype(DataType::BOOL);
+}
+
 }  // namespace phi
+
+PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
+PD_REGISTER_INFER_META_FN(conv2d, phi::ConvInferMeta);
+PD_REGISTER_INFER_META_FN(conv2d_infer, phi::ConvInferInferMeta);
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index cfae45cf04b87..f9a9398437753 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/meta_tensor.h"
 
 namespace phi {
@@ -28,12 +29,20 @@ namespace phi {
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 
 void AllValueCompareInferMeta(const MetaTensor& x,
                               const MetaTensor& y,
                               MetaTensor* out,
                               MetaConfig config = MetaConfig());
 
+void KLDivInferMeta(const MetaTensor& x,
+                    const MetaTensor& label,
+                    const std::string& reduction,
+                    MetaTensor* out,
+                    MetaConfig config = MetaConfig());
+
 void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
 
 void BCELossInferMeta(const MetaTensor& input,
@@ -60,6 +69,44 @@ void CompareInferMeta(const MetaTensor& x,
                       int axis,
                       MetaTensor* out);
 
+void ConvInferMeta(const MetaTensor& input,
+                   const MetaTensor& filter,
+                   const std::vector<int>& strides,
+                   const std::vector<int>& paddings,
+                   const std::string& paddding_algorithm,
+                   int groups,
+                   const std::vector<int>& dilations,
+                   const std::string& data_format,
+                   bool use_addto,
+                   int workspace_size_MB,
+                   bool exhaustive_search,
+                   MetaTensor* out,
+                   MetaConfig config = MetaConfig());
+
+void ConvInferInferMeta(const MetaTensor& input,
+                        const MetaTensor& filter,
+                        const std::vector<int>& strides,
+                        const std::vector<int>& paddings,
+                        const std::string& paddding_algorithm,
+                        int groups,
+                        const std::vector<int>& dilations,
+                        const std::string& data_format,
+                        MetaTensor* out,
+                        MetaConfig config = MetaConfig());
+
+void ConvTransposeInferMeta(const MetaTensor& x,
+                            const MetaTensor& filter,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            const std::vector<int>& output_padding,
+                            const std::vector<int>& output_size,
+                            const std::string& padding_algorithm,
+                            int groups,
+                            const std::vector<int>& dilations,
+                            const std::string& data_format,
+                            MetaTensor* out,
+                            MetaConfig config = MetaConfig());
+
 void CrossInferMeta(const MetaTensor& x,
                     const MetaTensor& y,
                     int axis,
@@ -81,6 +128,16 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta,
                              int axis,
                              MetaTensor* out);
 
+void ExpandAsInferMeta(const MetaTensor& x,
+                       paddle::optional<const MetaTensor&> y,
+                       const std::vector<int>& target_shape,
+                       MetaTensor* out);
+
+void GatherInferMeta(const MetaTensor& x,
+                     const MetaTensor& index,
+                     const Scalar& axis,
+                     MetaTensor* out);
+
 void GatherNdInferMeta(const MetaTensor& x,
                        const MetaTensor& index,
                        MetaTensor* out);
@@ -89,6 +146,11 @@ void GatherTreeMeta(const MetaTensor& ids,
                     const MetaTensor& parents,
                     MetaTensor* out);
 
+void GridSampleBaseInferMeta(const MetaTensor& x,
+                             const MetaTensor& grid,
+                             MetaTensor* out,
+                             MetaConfig config = MetaConfig());
+
 void HuberLossInferMeta(const MetaTensor& input_meta,
                         const MetaTensor& label_meta,
                         float delta,
@@ -101,12 +163,23 @@ void IndexSampleInferMeta(const MetaTensor& x,
                           MetaTensor* out,
                           MetaConfig config = MetaConfig());
 
+void IndexSelectInferMeta(const MetaTensor& x,
+                          const MetaTensor& index,
+                          int dim,
+                          MetaTensor* output);
+
+void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out);
+
 void LogLossInferMeta(const MetaTensor& input,
                       const MetaTensor& label,
                       float epsilon,
                       MetaTensor* out,
                       MetaConfig config = MetaConfig());
 
+void MaskedSelectInferMeta(const MetaTensor& x,
+                           const MetaTensor& mask,
+                           MetaTensor* out);
+
 void MatmulInferMeta(const MetaTensor& x,
                      const MetaTensor& y,
                      bool trans_x,
@@ -115,6 +188,19 @@ void MatmulInferMeta(const MetaTensor& x,
 
 void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out);
 
+void PReluInferMeta(const MetaTensor& x,
+                    const MetaTensor& alpha,
+                    const std::string& mode,
+                    const std::string& data_format,
+                    MetaTensor* out,
+                    MetaConfig config);
+
+void SearchsortedInferMeta(const MetaTensor& sorted_sequence,
+                           const MetaTensor& value,
+                           bool out_int32,
+                           bool right,
+                           MetaTensor* out);
+
 void SegmentPoolInferMeta(const MetaTensor& x,
                           const MetaTensor& segment_ids,
                           const std::string& pooltype,
@@ -136,4 +222,23 @@ void TriangularSolveInferMeta(const MetaTensor& x,
                               bool unitriangular,
                               MetaTensor* out);
 
+void YoloBoxInferMeta(const MetaTensor& x,
+                      const MetaTensor& img_size,
+                      const std::vector<int>& anchors,
+                      int class_num,
+                      float conf_thresh,
+                      int downsample_ratio,
+                      bool clip_bbox,
+                      float scale_x_y,
+                      bool iou_aware,
+                      float iou_aware_factor,
+                      MetaTensor* boxes,
+                      MetaTensor* scores,
+                      MetaConfig config = MetaConfig());
+
+void ValueCompareInferMeta(const MetaTensor& x,
+                           const MetaTensor& y,
+                           MetaTensor* out,
+                           MetaConfig config = MetaConfig());
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 84441ed8b740b..3faf42fe1ab1a 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/phi/infermeta/multiary.h"
 #include <vector>
+#include "paddle/phi/common/layout.h"
 #include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 namespace phi {
@@ -200,6 +202,151 @@ void AucInferMeta(const MetaTensor& input,
   }
 }
 
+void BatchNormInferMeta(const MetaTensor& x,
+                        const MetaTensor& scale,
+                        const MetaTensor& bias,
+                        const MetaTensor& mean,
+                        const MetaTensor& variance,
+                        float momentum,
+                        float epsilon,
+                        const std::string& data_layout_str,
+                        bool is_test,
+                        bool use_global_stats,
+                        bool trainable_statistics,
+                        bool fuse_with_relu,
+                        MetaTensor* y,
+                        MetaTensor* mean_out,
+                        MetaTensor* variance_out,
+                        MetaTensor* saved_mean,
+                        MetaTensor* saved_variance,
+                        MetaTensor* reserve_space,
+                        MetaConfig config) {
+  const auto x_dims = x.dims();
+  for (int i = 0; i < x_dims.size(); i++) {
+    PADDLE_ENFORCE_EQ(
+        (x_dims[i] == -1) || (x_dims[i] > 0),
+        true,
+        phi::errors::InvalidArgument(
+            "Each dimension of input tensor is expected to be -1 or a "
+            "positive number, but recieved %d. Input's shape is [%s].",
+            x_dims[i],
+            x_dims));
+  }
+
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_layout_str);
+
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "ShapeError: the dimension of input "
+          "X must greater than or equal to 2. But received: the shape of input "
+          "X = [%s], the dimension of input X =[%d]",
+          x_dims,
+          x_dims.size()));
+  PADDLE_ENFORCE_LE(
+      x_dims.size(),
+      5,
+      phi::errors::InvalidArgument(
+          "ShapeError: the dimension of input X "
+          "must smaller than or equal to 5. But received: the shape of input X "
+          "= [%s], the dimension of input X = [%d]",
+          x_dims,
+          x_dims.size()));
+
+  const int64_t C = ((config.is_run_mkldnn_kernel == true) ||
+                             (data_layout == DataLayout::kNCHW)
+                         ? x_dims[1]
+                         : x_dims[x_dims.size() - 1]);
+  auto scale_dim = scale.dims();
+  auto bias_dim = bias.dims();
+
+  PADDLE_ENFORCE_EQ(
+      scale_dim.size(),
+      1UL,
+      phi::errors::InvalidArgument(
+          "ShapeError: the dimension of scale must equal to 1."
+          "But received: the shape of scale is [%s], the dimension "
+          "of scale is [%d]",
+          scale_dim,
+          scale_dim.size()));
+  PADDLE_ENFORCE_EQ(bias_dim.size(),
+                    1UL,
+                    phi::errors::InvalidArgument(
+                        "ShapeError: the dimension of bias must equal to 1."
+                        "But received: the shape of bias is [%s],the dimension "
+                        "of bias is [%d]",
+                        bias_dim,
+                        bias_dim.size()));
+
+  bool check = true;
+  if ((!config.is_runtime) &&
+      (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0],
+                      C,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: the shape of scale must equal to [%d]"
+                          "But received: the shape of scale is [%d]",
+                          C,
+                          scale_dim[0]));
+    PADDLE_ENFORCE_EQ(bias_dim[0],
+                      C,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: the shape of bias must equal to [%d]"
+                          "But received: the shape of bias is [%d]",
+                          C,
+                          bias_dim[0]));
+  }
+  y->set_dims(x_dims);
+  mean_out->set_dims({C});
+  variance_out->set_dims({C});
+  if (saved_mean) {
+    saved_mean->set_dims({C});
+  }
+  if (saved_variance) {
+    saved_variance->set_dims({C});
+  }
+  y->share_lod(x);
+}
+
+void BatchNormInferInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& bias,
+                             const MetaTensor& mean,
+                             const MetaTensor& variance,
+                             float momentum,
+                             float epsilon,
+                             const std::string& data_layout,
+                             MetaTensor* y,
+                             MetaTensor* mean_out,
+                             MetaTensor* variance_out,
+                             MetaConfig config) {
+  BatchNormInferMeta(x,
+                     scale,
+                     bias,
+                     mean,
+                     variance,
+                     momentum,
+                     epsilon,
+                     data_layout,
+                     /*is_test=*/true,
+                     /*use_global_stats=*/false,
+                     /*trainable_statistics=*/false,
+                     /*fuse_with_relu=*/false,
+                     y,
+                     mean_out,
+                     variance_out,
+                     /*saved_mean=*/nullptr,
+                     /*saved_variance=*/nullptr,
+                     /*reserve_space=*/nullptr,
+                     config);
+}
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -369,6 +516,113 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
   out->share_lod(*x.at(0));
 }
 
+void HierarchicalSigmoidInferMeta(const MetaTensor& x,
+                                  const MetaTensor& w,
+                                  const MetaTensor& label,
+                                  paddle::optional<const MetaTensor&> path,
+                                  paddle::optional<const MetaTensor&> code,
+                                  paddle::optional<const MetaTensor&> bias,
+                                  int num_classes,
+                                  bool remote_prefetch,
+                                  int trainer_id,
+                                  const std::vector<int64_t>& height_sections,
+                                  const std::vector<std::string>& epmap,
+                                  const std::vector<std::string>& table_names,
+                                  bool is_sparse,
+                                  MetaTensor* out,
+                                  MetaTensor* pre_out,
+                                  MetaTensor* w_out) {
+  const int64_t input_dims = x.dims()[0];
+  const int64_t label_dims = label.dims()[0];
+  PADDLE_ENFORCE_EQ(input_dims,
+                    label_dims,
+                    phi::errors::InvalidArgument(
+                        "The first dimension of "
+                        "input and label is expected to be the same. "
+                        "But received input's first dimension is %d; "
+                        "label's first dimension is %d.",
+                        input_dims,
+                        label_dims));
+
+  std::vector<int64_t> output_shape({input_dims, 1});
+  out->set_dims(phi::make_ddim(output_shape));
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
+void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out) {
+  auto inputs_dims = GetMetaTensorsDim(x);
+
+  const size_t inputs_num = inputs_dims.size();
+  PADDLE_ENFORCE_GT(
+      inputs_num,
+      static_cast<size_t>(1),
+      phi::errors::InvalidArgument(
+          "The number of input tensors in multi_dot op should > 1."));
+
+  const size_t n = inputs_dims.size();
+  auto first_dim = inputs_dims[0];
+
+  bool is_vector = false;
+  phi::DDim out_dim;
+
+  PADDLE_ENFORCE_LT(
+      first_dim.size(),
+      static_cast<size_t>(3),
+      phi::errors::InvalidArgument(
+          "multi_dot: the first input tensor must be 1D or 2D but got[%d]!",
+          static_cast<int>(first_dim.size())));
+
+  // If the first tensor is 1D of size n view it as a row vector (1, n)
+  if (first_dim.size() == 1) {
+    first_dim = phi::make_ddim({1, static_cast<int>(first_dim[0])});
+    is_vector = true;
+  }
+
+  auto last_dim = inputs_dims[n - 1];
+  PADDLE_ENFORCE_LT(
+      last_dim.size(),
+      static_cast<size_t>(3),
+      phi::errors::InvalidArgument(
+          "the last input tensor of multi_dot must be 1D or 2D but got[%d]!",
+          static_cast<int>(first_dim.size())));
+
+  // If the last tensor is 1D of size n view it as a column vector (n, 1)
+  if (last_dim.size() == 1) {
+    last_dim = phi::make_ddim({static_cast<int>(last_dim[0]), 1});
+    out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]});
+  } else {
+    out_dim = is_vector ? phi::make_ddim({last_dim[1]})
+                        : phi::make_ddim({first_dim[0], last_dim[1]});
+  }
+
+  auto width = first_dim[1];
+  for (size_t i = 1; i < n - 1; i++) {
+    PADDLE_ENFORCE_EQ(inputs_dims[i].size(),
+                      static_cast<size_t>(2),
+                      phi::errors::InvalidArgument(
+                          "the input tensor of multi_dot op must be 2D."));
+
+    const auto& tmp_dim = inputs_dims[i];
+    PADDLE_ENFORCE_EQ(
+        tmp_dim[0],
+        width,
+        phi::errors::InvalidArgument(
+            "the input matrix does not meet the multiplication requirements."));
+    width = tmp_dim[1];
+  }
+
+  PADDLE_ENFORCE_EQ(
+      last_dim[0],
+      width,
+      phi::errors::InvalidArgument(
+          "the input matrix does not meet the multiplication requirements."));
+
+  out->set_dims(out_dim);
+  out->set_dtype(x.at(0)->dtype());
+  out->share_lod(*x.at(0));
+}
+
 void PsroiPoolInferMeta(const MetaTensor& x,
                         const MetaTensor& rois,
                         paddle::optional<const MetaTensor&> rois_num,
@@ -470,3 +724,6 @@ void WhereInferMeta(const MetaTensor& condition,
 }
 
 }  // namespace phi
+
+PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta);
+PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta);
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index c11843212ed33..e9b5d8c872fb9 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -18,6 +18,23 @@ limitations under the License. */
 #include "paddle/phi/core/meta_tensor.h"
 namespace phi {
 
+// Common InferMeta Functions for multiary operators, The format like:
+//
+//   1. The number of input MetaTensor is more than 3:
+//      void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
+//                                          const MetaTensor& y,
+//                                          const MetaTensor& z,
+//                                          const MetaTensor& w,
+//                                          ...,
+//                                          MetaTensor* out) {}
+//
+//   2. There are `const vector<MetaTensor*>&` in params:
+//      void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
+//                                          ...,
+//                                          MetaTensor* out) {}
+//
+// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
+
 std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
 
 void AdadeltaInferMeta(const MetaTensor& param,
@@ -55,6 +72,39 @@ void AucInferMeta(const MetaTensor& input,
                   MetaTensor* stat_neg_out,
                   MetaConfig config = MetaConfig());
 
+void BatchNormInferMeta(const MetaTensor& x,
+                        const MetaTensor& scale,
+                        const MetaTensor& bias,
+                        const MetaTensor& mean,
+                        const MetaTensor& variance,
+                        float momentum,
+                        float epsilon,
+                        const std::string& data_layout,
+                        bool is_test,
+                        bool use_global_stats,
+                        bool trainable_statistics,
+                        bool fuse_with_relu,
+                        MetaTensor* y,
+                        MetaTensor* mean_out,
+                        MetaTensor* variance_out,
+                        MetaTensor* saved_mean,
+                        MetaTensor* saved_variance,
+                        MetaTensor* reserve_space,
+                        MetaConfig config = MetaConfig());
+
+void BatchNormInferInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& bias,
+                             const MetaTensor& mean,
+                             const MetaTensor& variance,
+                             float momentum,
+                             float epsilon,
+                             const std::string& data_layout,
+                             MetaTensor* y,
+                             MetaTensor* mean_out,
+                             MetaTensor* variance_out,
+                             MetaConfig config = MetaConfig());
+
 void BilinearTensorProductInferMeta(const MetaTensor& x,
                                     const MetaTensor& y,
                                     const MetaTensor& weight,
@@ -70,6 +120,25 @@ void ConcatInferMeta(const std::vector<MetaTensor*>& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
 
+void HierarchicalSigmoidInferMeta(const MetaTensor& x,
+                                  const MetaTensor& w,
+                                  const MetaTensor& label,
+                                  paddle::optional<const MetaTensor&> path,
+                                  paddle::optional<const MetaTensor&> code,
+                                  paddle::optional<const MetaTensor&> bias,
+                                  int num_classes,
+                                  bool remote_prefetch,
+                                  int trainer_id,
+                                  const std::vector<int64_t>& height_sections,
+                                  const std::vector<std::string>& epmap,
+                                  const std::vector<std::string>& table_names,
+                                  bool is_sparse,
+                                  MetaTensor* out,
+                                  MetaTensor* pre_out,
+                                  MetaTensor* w_out);
+
+void MultiDotInferMeta(const std::vector<MetaTensor*>& x, MetaTensor* out);
+
 void PsroiPoolInferMeta(const MetaTensor& x,
                         const MetaTensor& rois,
                         paddle::optional<const MetaTensor&> rois_num,
diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h
index 38eaa636f8c87..55e59b27e71cf 100644
--- a/paddle/phi/infermeta/nullary.h
+++ b/paddle/phi/infermeta/nullary.h
@@ -27,6 +27,8 @@ namespace phi {
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 
 void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);
 
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 235cfe368c192..a72b8d913f81d 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -145,6 +145,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                             const MetaTensor& src_index,
                             const MetaTensor& dst_index,
                             const std::string& pool_type,
+                            int64_t out_size,
                             MetaTensor* out,
                             MetaTensor* dst_count) {
   auto src_index_dims = src_index.dims();
@@ -187,11 +188,23 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                         "Src_index and Dst_index should have the same shape."));
 
   auto dims = x.dims();
-  out->set_dims(dims);
+  if (out_size <= 0) {
+    out->set_dims(dims);
+  } else {
+    std::vector<int64_t> dims_ = phi::vectorize(dims);
+    if (dims_.size() > 0) {
+      dims_[0] = out_size;
+    }
+    out->set_dims(phi::make_ddim(dims_));
+  }
   out->set_dtype(x.dtype());
 
   if (pool_type == "MEAN") {
-    dst_count->set_dims({dims[0]});
+    if (out_size <= 0) {
+      dst_count->set_dims({dims[0]});
+    } else {
+      dst_count->set_dims({out_size});
+    }
     dst_count->set_dtype(DataType::INT32);
   }
 }
@@ -322,6 +335,158 @@ void NllLossRawInferMeta(const MetaTensor& input,
   total_weight->set_dtype(input.dtype());
 }
 
+void RoiAlignInferMeta(const MetaTensor& x,
+                       const MetaTensor& boxes,
+                       paddle::optional<const MetaTensor&> boxes_num,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       int sampling_ratio,
+                       bool aligned,
+                       MetaTensor* out,
+                       MetaConfig config) {
+  auto input_dims = x.dims();
+  auto boxes_dims = boxes.dims();
+
+  if (boxes_num) {
+    auto boxes_num_dims = boxes_num->dims();
+    PADDLE_ENFORCE_EQ(
+        boxes_num_dims.size(),
+        1,
+        phi::errors::InvalidArgument("The size of boxes_num should be 1"
+                                     ", but received size = %d",
+                                     boxes_num_dims.size()));
+  }
+  PADDLE_ENFORCE_EQ(input_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The format of Input(x) in"
+                        "RoiAlignOp is NCHW. And the rank of input must be 4. "
+                        "But received rank = %d",
+                        input_dims.size()));
+  PADDLE_ENFORCE_EQ(boxes_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument("The rank of Input(boxes) "
+                                                 "in RoiAlignOp should be 2. "
+                                                 "But the rank of boxes is %d",
+                                                 boxes_dims.size()));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(boxes_dims[1],
+                      4,
+                      phi::errors::InvalidArgument(
+                          "The second dimension "
+                          "of Input(boxes) should be 4. But received the "
+                          "dimension = %d",
+                          boxes_dims[1]));
+  }
+
+  PADDLE_ENFORCE_GT(pooled_height,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The 'pooled_height' attribute in RoiAlignOp is "
+                        "invalid. The height must be greater than 0. But "
+                        "received 'pooled_height' = %d",
+                        pooled_height));
+  PADDLE_ENFORCE_GT(pooled_width,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The 'pooled_width' attribute in RoiAlignOp is "
+                        "invalid. The width must be greater than 0. But "
+                        "received 'pooled_width' = %d",
+                        pooled_width));
+  PADDLE_ENFORCE_GT(spatial_scale,
+                    0.0f,
+                    phi::errors::InvalidArgument(
+                        "The 'spatial_scale' attribute in RoiAlignOp is "
+                        "invalid. The scale must be greater than 0. But "
+                        "received 'spatial_scale' = %f",
+                        spatial_scale));
+
+  auto out_dims = input_dims;
+  out_dims[0] = boxes_dims[0];
+  out_dims[1] = input_dims[1];
+  out_dims[2] = pooled_height;
+  out_dims[3] = pooled_width;
+
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+}
+
+void RoiPoolInferMeta(const MetaTensor& x,
+                      const MetaTensor& boxes,
+                      paddle::optional<const MetaTensor&> boxes_num,
+                      int pooled_height,
+                      int pooled_width,
+                      float spatial_scale,
+                      MetaTensor* out,
+                      MetaTensor* arg_max) {
+  auto input_dims = x.dims();
+  auto boxes_dims = boxes.dims();
+
+  if (boxes_num) {
+    auto boxes_num_dims = boxes_num->dims();
+    PADDLE_ENFORCE_EQ(
+        boxes_num_dims.size(),
+        1,
+        phi::errors::InvalidArgument("The second dimension of boxes_num should "
+                                     "be 1, but received dimension is %d",
+                                     boxes_num_dims.size()));
+  }
+  PADDLE_ENFORCE_EQ(input_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The input data should be a four-dimensional "
+                        "tensor with [N,C,H,W], but received input data with "
+                        " %d dimension",
+                        input_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      boxes_dims.size(),
+      2,
+      phi::errors::InvalidArgument(
+          "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)"
+          "given as [[x1, y1, x2, y2], ...], but received boxes is "
+          "%d-dimensional LoDTensor",
+          boxes_dims.size()));
+  PADDLE_ENFORCE_EQ(
+      boxes_dims[1],
+      4,
+      phi::errors::InvalidArgument(
+          "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)"
+          "given as [[x1, y1, x2, y2], ...]. But the second dimension of  "
+          "the received data is %d",
+          boxes_dims[1]));
+
+  PADDLE_ENFORCE_GT(
+      pooled_height,
+      0,
+      phi::errors::OutOfRange("The pooled output height must be greater than 0"
+                              "but received height is %d",
+                              pooled_height));
+  PADDLE_ENFORCE_GT(
+      pooled_width,
+      0,
+      phi::errors::OutOfRange("The pooled output width must be greater than 0"
+                              "but received width is %d",
+                              pooled_width));
+  PADDLE_ENFORCE_GT(
+      spatial_scale,
+      0.0f,
+      phi::errors::OutOfRange("The spatial scale must be greater than 0, "
+                              "but received spatial scale is %f",
+                              spatial_scale));
+
+  auto out_dims = input_dims;
+  out_dims[0] = boxes_dims[0];
+  out_dims[1] = input_dims[1];
+  out_dims[2] = pooled_height;
+  out_dims[3] = pooled_width;
+
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+  arg_max->set_dims(out_dims);
+  arg_max->set_dtype(DataType::INT64);
+}
+
 void ScatterInferMeta(const MetaTensor& x,
                       const MetaTensor& index,
                       const MetaTensor& updates,
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 209a07db18b5c..8521a1ee855c6 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -30,6 +30,8 @@ namespace phi {
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
 //
+// The InferMeta Functions in this file are arranged in alphabetic order.
+
 void AccuracyInferMeta(const MetaTensor& out,
                        const MetaTensor& indice,
                        const MetaTensor& label,
@@ -49,6 +51,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                             const MetaTensor& src_index,
                             const MetaTensor& dst_index,
                             const std::string& pool_type,
+                            int64_t out_size,
                             MetaTensor* out,
                             MetaTensor* dst_count);
 
@@ -71,6 +74,26 @@ void NllLossRawInferMeta(const MetaTensor& input,
                          MetaTensor* total_weight,
                          MetaConfig config = MetaConfig());
 
+void RoiAlignInferMeta(const MetaTensor& x,
+                       const MetaTensor& boxes,
+                       paddle::optional<const MetaTensor&> boxes_num,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       int sampling_ratio,
+                       bool aligned,
+                       MetaTensor* out,
+                       MetaConfig config = MetaConfig());
+
+void RoiPoolInferMeta(const MetaTensor& x,
+                      const MetaTensor& boxes,
+                      paddle::optional<const MetaTensor&> boxes_num,
+                      int pooled_height,
+                      int pooled_width,
+                      float spatial_scale,
+                      MetaTensor* out,
+                      MetaTensor* arg_max);
+
 void ScatterInferMeta(const MetaTensor& x,
                       const MetaTensor& index,
                       const MetaTensor& updates,
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index d09a2191fb2d6..e44032285ac1a 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/funcs/unfold_functor.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
 
 namespace phi {
 
@@ -304,6 +305,17 @@ void DiagonalInferMeta(const MetaTensor& input,
   out->set_dims(phi::make_ddim(out_dims));
 }
 
+void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask) {
+  auto x_dims = x.dims();
+  out->set_dims(x_dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+
+  if (mask != nullptr) {
+    mask->set_dims(x_dims);
+  }
+}
+
 void EighInferMeta(const MetaTensor& x,
                    const std::string& uplo,
                    MetaTensor* out_w,
@@ -392,6 +404,26 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x,
   UnchangedInferMetaCheckAxis(x, axis, out);
 }
 
+void HistogramInferMeta(
+    const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out) {
+  PADDLE_ENFORCE_GE(bins,
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The bins should be greater than or equal to 1."
+                        "But received nbins is %d",
+                        bins));
+  PADDLE_ENFORCE_GE(
+      max,
+      min,
+      phi::errors::InvalidArgument("max must be larger or equal to min."
+                                   "But received max is %d, min is %d",
+                                   max,
+                                   min));
+
+  out->set_dims({bins});
+  out->share_lod(input);
+}
+
 void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) {
   PADDLE_ENFORCE_EQ(
       product(x.dims()),
@@ -554,6 +586,89 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) {
   out->set_dtype(DataType::BOOL);
 }
 
+void KthvalueInferMeta(const MetaTensor& x,
+                       int k,
+                       int axis,
+                       bool keepdim,
+                       MetaTensor* out,
+                       MetaTensor* indices,
+                       MetaConfig config) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_LT(axis,
+                    dim_size,
+                    phi::errors::InvalidArgument(
+                        "the axis must be [-%d, %d), but received %d .",
+                        dim_size,
+                        dim_size,
+                        axis));
+  PADDLE_ENFORCE_GE(axis,
+                    -dim_size,
+                    phi::errors::InvalidArgument(
+                        "the axis must be [-%d, %d), but received %d .",
+                        dim_size,
+                        dim_size,
+                        axis));
+  if (axis < 0) axis += dim_size;
+  PADDLE_ENFORCE_GE(
+      k,
+      1,
+      phi::errors::InvalidArgument(
+          "the k in the kthvalue must >= 1, but received %d .", k));
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape"));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_GE(
+        input_dims[axis],
+        k,
+        phi::errors::InvalidArgument(
+            "input of kthvalue must have >= %d columns in axis of %d",
+            k,
+            axis));
+  }
+  std::vector<int64_t> dimvec;
+  for (int64_t i = 0; i < axis; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  if (keepdim) {
+    dimvec.emplace_back(static_cast<int64_t>(1));
+  }
+  for (int64_t i = axis + 1; i < dim_size; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  DDim dims = phi::make_ddim(dimvec);
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(x.dtype());
+}
+
+void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) {
+  auto dims = x.dims();
+  auto n_dim = dims.size();
+  PADDLE_ENFORCE_GE(n_dim,
+                    2,
+                    phi::errors::InvalidArgument(
+                        "The Input(X) should have at least 2 dimensions. But "
+                        "received a %d dimension tensor.",
+                        n_dim));
+  PADDLE_ENFORCE_EQ(dims[n_dim - 2],
+                    dims[n_dim - 1],
+                    phi::errors::InvalidArgument(
+                        "The inner-most 2 dimensions of Input(X) all should "
+                        "be square matrices "
+                        "But received X's shape[-2] = %d and shape[-1] = %d.",
+                        dims[n_dim - 2],
+                        dims[n_dim - 1]));
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
 void MaxPoolWithIndexInferMeta(const MetaTensor& x,
                                const std::vector<int>& kernel_size,
                                const std::vector<int>& strides,
@@ -626,6 +741,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
   mask->set_dtype(paddle::experimental::CppTypeToDataType<int>::Type());
 }
 
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_EQ(
+      (axis < dim_size) && (axis >= (-1 * dim_size)),
+      true,
+      errors::InvalidArgument(
+          "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
+          dim_size,
+          dim_size,
+          axis));
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      errors::InvalidArgument("input of ModeOp must have >= 1d shape"));
+  if (axis < 0) axis += dim_size;
+  std::vector<int64_t> dimvec;
+  for (int64_t i = 0; i < axis; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  if (keepdim) {
+    dimvec.emplace_back(static_cast<int64_t>(1));
+  }
+  for (int64_t i = axis + 1; i < dim_size; i++) {
+    dimvec.emplace_back(input_dims[i]);
+  }
+  DDim dims = phi::make_ddim(dimvec);
+  PADDLE_ENFORCE_GE(input_dims.size(),
+                    1,
+                    errors::InvalidArgument("input shape should >= 1d"));
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(x.dtype());
+}
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
@@ -661,6 +819,24 @@ void MultinomialInferMeta(const MetaTensor& x,
   out->set_dtype(DataType::INT64);
 }
 
+void NormInferMeta(const MetaTensor& x,
+                   int axis,
+                   float epsilon,
+                   bool is_test,
+                   MetaTensor* out,
+                   MetaTensor* norm) {
+  auto xdim = x.dims();
+  out->set_dims(x.dims());
+  out->set_dtype(x.dtype());
+
+  if (is_test == false) {
+    if (axis < 0) axis = xdim.size() + axis;
+    xdim[axis] = 1;
+    norm->set_dims(xdim);
+    norm->set_dtype(x.dtype());
+  }
+}
+
 void PadInferMeta(const MetaTensor& input,
                   const std::vector<int>& paddings,
                   float pad_value,
@@ -702,6 +878,77 @@ void PadInferMeta(const MetaTensor& input,
   out->set_dtype(input.dtype());
 }
 
+void Pad3dInferMeta(const MetaTensor& x,
+                    const ScalarArray& paddings_scalar_array,
+                    const std::string& mode,
+                    float value,
+                    const std::string& data_format,
+                    MetaTensor* out,
+                    MetaConfig config) {
+  auto x_dim = x.dims();
+  PADDLE_ENFORCE_EQ(x_dim.size(),
+                    5,
+                    errors::InvalidArgument(
+                        "The size of Input(X)'s dimension should be equal to "
+                        "5, but received %d. ",
+                        x_dim.size()));
+
+  std::vector<int64_t> out_dims(x_dim.size());
+  out_dims[0] = x_dim[0];
+  if (paddings_scalar_array.FromTensor()) {
+    if (config.is_runtime) {
+      PADDLE_ENFORCE_EQ(
+          paddings_scalar_array.GetData().size(),
+          6,
+          errors::InvalidArgument("Shape of Input(Paddings) should be equal to "
+                                  "[6], but received [%d].",
+                                  paddings_scalar_array.GetData().size()));
+    }
+    out_dims[1] = x_dim[1];
+    out_dims[2] = x_dim[2];
+    out_dims[3] = x_dim[3];
+  } else {
+    auto paddings = paddings_scalar_array.GetData();
+
+    PADDLE_ENFORCE_EQ(
+        paddings.size(),
+        6,
+        errors::InvalidArgument(
+            "Size of paddings should be equal to 6, but received %d.",
+            static_cast<int>(paddings.size())));
+    if (data_format == "NCDHW") {
+      out_dims[1] = x_dim[1];  // channel
+      out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0))
+                        ? x_dim[2]
+                        : (x_dim[2] + paddings[4] + paddings[5]);  // depth
+
+      out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0))
+                        ? x_dim[3]
+                        : (x_dim[3] + paddings[2] + paddings[3]);  // height
+
+      out_dims[4] = ((!config.is_runtime) && (x_dim[4] < 0))
+                        ? x_dim[4]
+                        : (x_dim[4] + paddings[0] + paddings[1]);  // width
+    } else {                                                       // NDHWC
+      out_dims[4] = x_dim[4];                                      // channel
+
+      out_dims[1] = ((!config.is_runtime) && (x_dim[1] < 0))
+                        ? x_dim[1]
+                        : (x_dim[1] + paddings[4] + paddings[5]);  // depth
+      out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0))
+                        ? x_dim[2]
+                        : (x_dim[2] + paddings[2] + paddings[3]);  // height
+      out_dims[3] = ((!config.is_runtime) && (x_dim[3] < 0))
+                        ? x_dim[3]
+                        : (x_dim[3] + paddings[0] + paddings[1]);  // width
+    }
+  }
+
+  out->set_dims(phi::make_ddim(out_dims));
+  out->set_dtype(x.dtype());
+  out->share_lod(x);
+}
+
 void PixelShuffleInferMeta(const MetaTensor& x,
                            int upscale_factor,
                            const std::string& data_format,
@@ -994,6 +1241,80 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
   ReshapeInferMeta(x, shape, out, config);
 }
 
+void ReverseInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axis,
+                      MetaTensor* out) {
+  PADDLE_ENFORCE_NE(axis.empty(),
+                    true,
+                    phi::errors::InvalidArgument("'axis' can not be empty."));
+  const auto& x_dims = x.dims();
+  for (int a : axis) {
+    PADDLE_ENFORCE_LT(a,
+                      x_dims.size(),
+                      phi::errors::OutOfRange(
+                          "The axis must be less than input tensor's rank. "
+                          "but got %d >= %d",
+                          a,
+                          x_dims.size()));
+    PADDLE_ENFORCE_GE(
+        a,
+        -x_dims.size(),
+        phi::errors::OutOfRange(
+            "The axis must be greater than the negative number of "
+            "input tensor's rank, but got %d < %d",
+            a,
+            -x_dims.size()));
+  }
+  out->share_meta(x);
+}
+
+void RollInferMeta(const MetaTensor& x,
+                   const ScalarArray& shifts,
+                   const std::vector<int64_t>& axis,
+                   MetaTensor* out) {
+  auto shifts_data = shifts.GetData();
+
+  if (axis.size() != 0) {
+    PADDLE_ENFORCE_EQ(
+        axis.size(),
+        shifts_data.size(),
+        phi::errors::InvalidArgument("When dims.size() != 0, dims.size() "
+                                     "should be equal to "
+                                     "shifts.size(). But received "
+                                     "dims.size() = %d, shifts.size() = %d",
+                                     axis.size(),
+                                     shifts_data.size()));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        shifts_data.size(),
+        1,
+        phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() "
+                                     "should be equal to 1, But received "
+                                     "shifts.size() = %d",
+                                     shifts_data.size()));
+  }
+
+  out->set_dims(x.dims());
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+}
+
+void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) {
+  auto in_dims = x.dims();
+  PADDLE_ENFORCE_LT(
+      in_dims.size(),
+      7,
+      phi::errors::InvalidArgument(
+          "The rank of input should be less than 7, but received %d.",
+          in_dims.size()));
+}
+
+void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) {
+  auto in_dim = input.dims();
+  out->set_dims(phi::make_ddim({in_dim.size()}));
+  out->set_dtype(DataType::INT32);
+}
+
 void ShardIndexInferMeta(const MetaTensor& in,
                          int index_num,
                          int nshards,
@@ -1177,6 +1498,40 @@ void SplitInferMeta(const MetaTensor& x,
   }
 }
 
+void SqueezeInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axes,
+                      MetaTensor* xshape,
+                      MetaTensor* out) {
+  const auto& x_dims = x.dims();
+  // Check input tensor dims (<6) Eigen limit.
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    6,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(X) "
+                        "should be in the range of [1, 6] (Eigen limit)."
+                        "But received X's dimensions = %d, X's shape = [%s].",
+                        x_dims.size(),
+                        x_dims));
+
+  auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, false);
+  out->set_dims(out_dims);
+  if (x_dims[0] == out_dims[0]) {
+    // Only pass LoD when the first dimension of output and Input(X)
+    // are the same.
+    out->share_lod(x);
+  }
+
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
+  xshape->set_dtype(x.dtype());
+  out->set_dtype(x.dtype());
+}
+
 /*  Why not use SumRawInferMeta directly?
     Because we need make InferMetaFunction's args follow the design of api.yaml
 */
@@ -1282,6 +1637,55 @@ void TileInferMeta(const MetaTensor& x,
   }
 }
 
+void TopKInferMeta(const MetaTensor& x,
+                   const Scalar& k_scalar,
+                   int axis,
+                   bool largest,
+                   bool sorted,
+                   MetaTensor* out,
+                   MetaTensor* indices,
+                   MetaConfig config) {
+  auto input_dims = x.dims();
+  const int& dim_size = input_dims.size();
+  PADDLE_ENFORCE_EQ(
+      (axis < dim_size) && (axis >= (-1 * dim_size)),
+      true,
+      phi::errors::InvalidArgument(
+          "the axis of topk must be [-%d, %d), but you set axis is %d",
+          dim_size,
+          dim_size,
+          axis));
+
+  if (axis < 0) axis += dim_size;
+
+  int k = k_scalar.to<int>();
+  if (k_scalar.FromTensor()) {
+    k = -1;
+  } else {
+    PADDLE_ENFORCE_EQ(k >= 1,
+                      true,
+                      phi::errors::InvalidArgument(
+                          "the attribute of k in the topk must >= 1 or be a "
+                          "Tensor, but received %d .",
+                          k));
+  }
+
+  PADDLE_ENFORCE_GE(
+      input_dims.size(),
+      1,
+      phi::errors::InvalidArgument("input of topk must have >= 1d shape"));
+
+  phi::DDim dims = input_dims;
+
+  dims[axis] = k;
+  out->set_dims(dims);
+  out->share_lod(x);
+  out->set_dtype(x.dtype());
+  indices->set_dims(dims);
+  indices->share_lod(x);
+  indices->set_dtype(DataType::INT64);
+}
+
 void TraceInferMeta(
     const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) {
   int dim1 = axis1;
@@ -1397,6 +1801,17 @@ void TransposeInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
+void TransposeGradInferMeta(const MetaTensor& x,
+                            const std::vector<int>& axis,
+                            MetaTensor* out) {
+  std::vector<int> reversed_axis(axis);
+  for (size_t i = 0; i < axis.size(); i++) {
+    reversed_axis[axis[i]] = i;
+  }
+
+  TransposeInferMeta(x, reversed_axis, out);
+}
+
 void UnbindInferMeta(const MetaTensor& x,
                      int axis,
                      std::vector<MetaTensor>* outs) {
@@ -1428,7 +1843,7 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x,
   PADDLE_ENFORCE_GE(
       axis,
       -rank,
-      errors::InvalidArgument(
+      phi::errors::InvalidArgument(
           "Attr(axis) value should be in range [-R, R-1], "
           "R is the rank of Input(X). But received axis: %d, R: %d.",
           axis,
@@ -1602,6 +2017,78 @@ void UnfoldInferMeta(const MetaTensor& x,
   out->set_dims(phi::make_ddim(out_dims));
 }
 
+void UnsqueezeInferMeta(const MetaTensor& x,
+                        const ScalarArray& axes,
+                        MetaTensor* xshape,
+                        MetaTensor* out) {
+  const auto& x_dims = x.dims();
+  // Validity Check: input tensor dims (<6).
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    6,
+                    phi::errors::InvalidArgument(
+                        "Invalid "
+                        "dimensions, the rank of Input(X) "
+                        "should be in the range of [1, 6] (Eigen limit)"));
+  if (!axes.GetData().empty()) {
+    std::vector<int32_t> tmp;
+    tmp.reserve(axes.GetData().size());
+    std::for_each(axes.GetData().begin(),
+                  axes.GetData().end(),
+                  [&tmp](const int64_t& t) { tmp.push_back(t); });
+    auto out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
+    out->set_dims(out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      out->share_lod(x);
+    }
+  }
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
+  out->set_dtype(x.dtype());
+  xshape->set_dtype(x.dtype());
+}
+
+void OneHotRawInferMeta(const MetaTensor& x,
+                        int32_t depth,
+                        DataType dtype,
+                        bool allow_out_of_range,
+                        MetaTensor* out) {
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      1,
+      phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
+  auto out_dims_vec = phi::vectorize(x_dims);
+  out_dims_vec.push_back(depth);
+  auto out_dims = phi::make_ddim(out_dims_vec);
+  out->set_dims(out_dims);
+  out->share_lod(x);
+  out->set_dtype(dtype);
+}
+
+void OneHotInferMeta(const MetaTensor& x,
+                     const Scalar& depth_t,
+                     MetaTensor* out) {
+  auto x_dims = x.dims();
+  PADDLE_ENFORCE_GE(
+      x_dims.size(),
+      1,
+      phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
+
+  int depth = depth_t.to<int>();
+  auto out_dims_vec = phi::vectorize(x_dims);
+  out_dims_vec.push_back(depth);
+  auto out_dims = phi::make_ddim(out_dims_vec);
+  out->set_dims(out_dims);
+  out->share_lod(x);
+
+  out->set_dtype(phi::DataType::FLOAT32);
+}
+
 void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) {
   auto rank = condition.dims().size();
   PADDLE_ENFORCE_GE(
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index a1fc6fd4053d7..f623f14a709ad 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -31,6 +31,8 @@ class MetaConfig;
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 // Because functions in this file not only can infer shape, but also need
 // infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 
 void ArgMinMaxInferMeta(const MetaTensor& x,
                         int64_t axis,
@@ -72,6 +74,8 @@ void DiagInferMeta(const MetaTensor& x,
 void DiagonalInferMeta(
     const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out);
 
+void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask);
+
 void EighInferMeta(const MetaTensor& x,
                    const std::string& uplo,
                    MetaTensor* out_w,
@@ -87,6 +91,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x,
                             bool hard,
                             int axis,
                             MetaTensor* out);
+void HistogramInferMeta(
+    const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out);
 
 void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out);
 
@@ -98,6 +104,16 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out);
 
 void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out);
 
+void KthvalueInferMeta(const MetaTensor& x,
+                       int k,
+                       int axis,
+                       bool keepdim,
+                       MetaTensor* out,
+                       MetaTensor* indices,
+                       MetaConfig = MetaConfig());
+
+void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out);
+
 void MaxPoolWithIndexInferMeta(const MetaTensor& x,
                                const std::vector<int>& kernel_size,
                                const std::vector<int>& strides,
@@ -108,10 +124,22 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x,
                                MetaTensor* mask,
                                MetaConfig config = MetaConfig());
 
+void ModeInferMeta(const MetaTensor& x,
+                   int axis,
+                   bool keepdim,
+                   MetaTensor* out,
+                   MetaTensor* indices);
+
 void MultinomialInferMeta(const MetaTensor& x,
                           int num_samples,
                           bool replacement,
                           MetaTensor* out);
+void NormInferMeta(const MetaTensor& x,
+                   int axis,
+                   float epsilon,
+                   bool is_test,
+                   MetaTensor* out,
+                   MetaTensor* norm);
 
 void PadInferMeta(const MetaTensor& input,
                   const std::vector<int>& paddings,
@@ -119,6 +147,14 @@ void PadInferMeta(const MetaTensor& input,
                   MetaTensor* out,
                   MetaConfig config = MetaConfig());
 
+void Pad3dInferMeta(const MetaTensor& x,
+                    const ScalarArray& paddings,
+                    const std::string& mode,
+                    float value,
+                    const std::string& data_format,
+                    MetaTensor* out,
+                    MetaConfig config = MetaConfig());
+
 void PixelShuffleInferMeta(const MetaTensor& x,
                            int upscale_factor,
                            const std::string& data_format,
@@ -162,6 +198,19 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x,
                                 MetaTensor* out,
                                 MetaConfig config = MetaConfig());
 
+void ReverseInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axis,
+                      MetaTensor* out);
+
+void RollInferMeta(const MetaTensor& x,
+                   const ScalarArray& shifts,
+                   const std::vector<int64_t>& axis,
+                   MetaTensor* out);
+
+void SetValueInferMeta(const MetaTensor& x, MetaTensor* out);
+
+void ShapeInferMeta(const MetaTensor& input, MetaTensor* out);
+
 void ShardIndexInferMeta(const MetaTensor& in,
                          int index_num,
                          int nshards,
@@ -180,6 +229,11 @@ void SplitInferMeta(const MetaTensor& x_meta,
                     std::vector<MetaTensor*> out,
                     MetaConfig config = MetaConfig());
 
+void SqueezeInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axes,
+                      MetaTensor* xshape,
+                      MetaTensor* out);
+
 void SumInferMeta(const MetaTensor& x,
                   const std::vector<int64_t>& axis,
                   DataType dtype,
@@ -198,6 +252,15 @@ void TileInferMeta(const MetaTensor& x,
                    MetaTensor* out,
                    MetaConfig config = MetaConfig());
 
+void TopKInferMeta(const MetaTensor& x,
+                   const Scalar& k_scalar,
+                   int axis,
+                   bool largest,
+                   bool sorted,
+                   MetaTensor* out,
+                   MetaTensor* indices,
+                   MetaConfig config = MetaConfig());
+
 void TraceInferMeta(
     const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out);
 
@@ -209,6 +272,10 @@ void TransposeInferMeta(const MetaTensor& x,
                         const std::vector<int>& axis,
                         MetaTensor* out);
 
+void TransposeGradInferMeta(const MetaTensor& x,
+                            const std::vector<int>& axis,
+                            MetaTensor* out);
+
 void UnbindInferMeta(const MetaTensor& x,
                      int axis,
                      std::vector<MetaTensor>* outs);
@@ -228,6 +295,19 @@ void UnfoldInferMeta(const MetaTensor& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
 
+void UnsqueezeInferMeta(const MetaTensor& x,
+                        const ScalarArray& axes,
+                        MetaTensor* xshape,
+                        MetaTensor* out);
+
+void OneHotRawInferMeta(const MetaTensor& x,
+                        int32_t depth,
+                        DataType dtype,
+                        bool allow_out_of_range,
+                        MetaTensor* out);
+
+void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out);
+
 void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index d443b7bb2a092..59540dbaefdd8 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -27,11 +27,18 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
 # Some kernels depend on some targets that are not commonly used.
 # These targets are not suitable for common dependencies.
 # In this case, you need to manually generate them here.
-set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel)
+set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
+    hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
+    matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
+    put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
+    softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
+    triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel)
 kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
+kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
+kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code)
 kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
-kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel)
+kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel)
 kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
 kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
 kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting)
@@ -46,6 +53,7 @@ kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax)
 kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
 kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel)
 kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce)
+kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse)
 
 # 4. auto parse and build kernel targets by cmake
 register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} )
@@ -54,3 +62,6 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
 add_subdirectory(sparse)
 
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
+
+# 5. kernel autotune 
+add_subdirectory(autotune)
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
index f34e5710ab729..6ad28f348f22f 100644
--- a/paddle/phi/kernels/activation_grad_kernel.h
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -19,37 +19,160 @@ limitations under the License. */
 
 namespace phi {
 
-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \
   template <typename T, typename Context>         \
   void name##GradKernel(const Context& dev_ctx,   \
                         const DenseTensor& x,     \
                         const DenseTensor& dout,  \
                         DenseTensor* dx);
 
-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& x,                   \
+                        const DenseTensor& dout,                \
+                        float attr,                             \
+                        DenseTensor* dx);
+
+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
+  template <typename T, typename Context>                               \
+  void name##GradKernel(const Context& dev_ctx,                         \
+                        const DenseTensor& x,                           \
+                        const DenseTensor& dout,                        \
+                        float attr1,                                    \
+                        float attr2,                                    \
+                        DenseTensor* dx);
+
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \
   template <typename T, typename Context>           \
   void name##GradKernel(const Context& dev_ctx,     \
                         const DenseTensor& out,     \
                         const DenseTensor& dout,    \
                         DenseTensor* dx);
 
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \
+  template <typename T, typename Context>                         \
+  void name##GradKernel(const Context& dev_ctx,                   \
+                        const DenseTensor& out,                   \
+                        const DenseTensor& dout,                  \
+                        float attr,                               \
+                        DenseTensor* dx);
+
+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
+  template <typename T, typename Context>                                 \
+  void name##GradKernel(const Context& dev_ctx,                           \
+                        const DenseTensor& out,                           \
+                        const DenseTensor& dout,                          \
+                        float attr1,                                      \
+                        float attr2,                                      \
+                        DenseTensor* dx);
+
 template <typename T, typename Context>
 void ReluDoubleGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
                           const DenseTensor& ddx,
                           DenseTensor* ddout);
 
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
+template <typename T, typename Context>
+void TanhDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          DenseTensor* dout_new,
+                          DenseTensor* ddout);
+
+template <typename T, typename Context>
+void TanhTripleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          const DenseTensor& d_ddout,
+                          const DenseTensor& d_dout_new,
+                          DenseTensor* d_out_new,
+                          DenseTensor* d_dout,
+                          DenseTensor* d_ddx);
+
+template <typename T, typename Context>
+void LeakyReluDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& ddx,
+                               float alpha,
+                               DenseTensor* ddout);
+
+template <typename T, typename Context>
+void EluGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& out,
+                   const DenseTensor& dout,
+                   float alpha,
+                   DenseTensor* dx);
+
+template <typename T, typename Context>
+void EluDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         float alpha,
+                         DenseTensor* dx,
+                         DenseTensor* ddout);
+
+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout);
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx);
+
+template <typename T, typename Context>
+void LogDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         DenseTensor* dx,
+                         DenseTensor* ddout);
+
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
+
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
+
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
+
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max);
+
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
index bdf8f4363598f..785d1089f06e8 100644
--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -24,6 +24,21 @@ namespace phi {
   void name##Kernel(                      \
       const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
+#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \
+  template <typename T, typename Context>                    \
+  void name##Kernel(const Context& dev_ctx,                  \
+                    const DenseTensor& x,                    \
+                    float attr,                              \
+                    DenseTensor* out);
+
+#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \
+  template <typename T, typename Context>                            \
+  void name##Kernel(const Context& dev_ctx,                          \
+                    const DenseTensor& x,                            \
+                    float attr1,                                     \
+                    float attr2,                                     \
+                    DenseTensor* out);
+
 DECLARE_ACTIVATION_KERNEL(Cos)
 DECLARE_ACTIVATION_KERNEL(Tan)
 DECLARE_ACTIVATION_KERNEL(Acos)
@@ -36,5 +51,22 @@ DECLARE_ACTIVATION_KERNEL(Asinh)
 DECLARE_ACTIVATION_KERNEL(Acosh)
 DECLARE_ACTIVATION_KERNEL(Atanh)
 DECLARE_ACTIVATION_KERNEL(Relu)
+DECLARE_ACTIVATION_KERNEL(Tanh)
+DECLARE_ACTIVATION_KERNEL(TanhShrink)
+DECLARE_ACTIVATION_KERNEL(Silu)
+DECLARE_ACTIVATION_KERNEL(Sigmoid)
+DECLARE_ACTIVATION_KERNEL(LogSigmoid)
+DECLARE_ACTIVATION_KERNEL(Log)
+DECLARE_ACTIVATION_KERNEL(Log2)
+DECLARE_ACTIVATION_KERNEL(Log10)
+DECLARE_ACTIVATION_KERNEL(Log1p)
+
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
+DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)
 }  // namespace phi
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
new file mode 100644
index 0000000000000..9faaace691766
--- /dev/null
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/assign_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename Context>
+void AssignKernel(const Context& dev_ctx,
+                  paddle::optional<const DenseTensor&> x,
+                  DenseTensor* out) {
+  if (!x.is_initialized()) {
+    return;
+  }
+  auto& x_tensor = *x.get_ptr();
+  Copy<Context>(dev_ctx, x_tensor, x_tensor.place(), false, out);
+}
+
+// Note: use `const paddle::optional<std::vector<const DenseTensor*>&> x`
+// as input if needed
+template <typename Context>
+void AssignArrayKernel(const Context& dev_ctx,
+                       const std::vector<const DenseTensor*>& x,
+                       std::vector<DenseTensor*> out) {
+  for (size_t i = 0; i < x.size(); ++i) {
+    AssignKernel<Context>(dev_ctx, *x[i], out.at(i));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(
+    assign, CPU, ALL_LAYOUT, phi::AssignKernel<phi::CPUContext>, ALL_DTYPE) {}
+PD_REGISTER_GENERAL_KERNEL(assign_array,
+                           CPU,
+                           ALL_LAYOUT,
+                           phi::AssignArrayKernel<phi::CPUContext>,
+                           ALL_DTYPE) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(
+    assign, GPU, ALL_LAYOUT, phi::AssignKernel<phi::GPUContext>, ALL_DTYPE) {}
+PD_REGISTER_GENERAL_KERNEL(assign_array,
+                           GPU,
+                           ALL_LAYOUT,
+                           phi::AssignArrayKernel<phi::GPUContext>,
+                           ALL_DTYPE) {}
+#endif
diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h
new file mode 100644
index 0000000000000..7cc06818dc007
--- /dev/null
+++ b/paddle/phi/kernels/assign_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// In order to be compatible with the `AsDispensable` input in the original
+// assign op maker, the input parameter here needs to be dispensable, but
+// this looks weird
+template <typename Context>
+void AssignKernel(const Context& dev_ctx,
+                  paddle::optional<const DenseTensor&> x,
+                  DenseTensor* out);
+
+template <typename Context>
+void AssignArrayKernel(const Context& dev_ctx,
+                       const std::vector<const DenseTensor*>& x,
+                       std::vector<DenseTensor*> out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt
new file mode 100644
index 0000000000000..c7bb30d2d767c
--- /dev/null
+++ b/paddle/phi/kernels/autotune/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (WITH_GPU)
+     nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
+elseif (WITH_ROCM)
+    hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
+endif()
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
new file mode 100644
index 0000000000000..87eca2613a7b5
--- /dev/null
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+namespace phi {
+
+class GpuTimer {
+ public:
+  GpuTimer() {
+#ifdef PADDLE_WITH_HIP
+    hipEventCreate(&start_);
+    hipEventCreate(&stop_);
+#else
+    cudaEventCreate(&start_);
+    cudaEventCreate(&stop_);
+#endif
+    PADDLE_ENFORCE_NOT_NULL(
+        start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
+    PADDLE_ENFORCE_NOT_NULL(
+        stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
+  }
+
+  ~GpuTimer() {
+#ifdef PADDLE_WITH_HIP
+    hipEventDestroy(start_);
+    hipEventDestroy(stop_);
+#else
+    cudaEventDestroy(start_);
+    cudaEventDestroy(stop_);
+#endif
+  }
+
+  void Start(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(start_, stream);
+#else
+    cudaEventRecord(start_, stream);
+#endif
+  }
+
+  void Stop(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(stop_, stream);
+#else
+    cudaEventRecord(stop_, stream);
+#endif
+  }
+
+  float ElapsedTime() {
+    float milliseconds = 0;
+#ifdef PADDLE_WITH_HIP
+    hipEventSynchronize(stop_);
+    hipEventElapsedTime(&milliseconds, start_, stop_);
+#else
+    cudaEventSynchronize(stop_);
+    cudaEventElapsedTime(&milliseconds, start_, stop_);
+#endif
+    return milliseconds;
+  }
+
+ private:
+  gpuEvent_t start_;
+  gpuEvent_t stop_;
+};
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/autotune/gpu_timer_test.cu b/paddle/phi/kernels/autotune/gpu_timer_test.cu
new file mode 100644
index 0000000000000..b6eb345885f30
--- /dev/null
+++ b/paddle/phi/kernels/autotune/gpu_timer_test.cu
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <functional>
+#include "glog/logging.h"
+#include "paddle/phi/kernels/autotune/gpu_timer.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+
+template <typename T, int VecSize>
+__global__ void VecSum(T *x, T *y, int N) {
+#ifdef __HIPCC__
+  int idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+#else
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+#endif
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) {
+    LoadT x_vec;
+    LoadT y_vec;
+    phi::Load<T, VecSize>(&x[i], &x_vec);
+    phi::Load<T, VecSize>(&y[i], &y_vec);
+#pragma unroll
+    for (int j = 0; j < VecSize; j++) {
+      y_vec[j] = x_vec[j] + y_vec[j];
+    }
+    phi::Store<T, VecSize>(y_vec, &y[i]);
+  }
+}
+
+template <int Vecsize, int Threads, size_t Blocks>
+void Algo(float *d_in, float *d_out, size_t N) {
+#ifdef __HIPCC__
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(VecSum<float, Vecsize>),
+                     dim3(Blocks),
+                     dim3(Threads),
+                     0,
+                     0,
+                     d_in,
+                     d_out,
+                     N);
+#else
+  VecSum<float, Vecsize><<<Blocks, Threads>>>(d_in, d_out, N);
+#endif
+}
+
+TEST(GpuTimer, Sum) {
+  float *in1, *in2, *out;
+  float *d_in1, *d_in2;
+  size_t N = 1 << 20;
+  size_t size = sizeof(float) * N;
+#ifdef __HIPCC__
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
+  cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
+  in1 = reinterpret_cast<float *>(malloc(size));
+  in2 = reinterpret_cast<float *>(malloc(size));
+  out = reinterpret_cast<float *>(malloc(size));
+  for (size_t i = 0; i < N; i++) {
+    in1[i] = 1.0f;
+    in2[i] = 2.0f;
+  }
+
+#ifdef __HIPCC__
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+#else
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+#endif
+
+  using Functor = std::function<void(float *, float *, size_t)>;
+  Functor alog0 = Algo<4, 256, 1024>;
+  Functor algo1 = Algo<1, 256, 1024>;
+  Functor alog2 = Algo<1, 256, 8>;
+
+  std::vector<Functor> algos = {alog0, algo1, alog2};
+
+  for (int j = 0; j < algos.size(); ++j) {
+    auto algo = algos[j];
+    phi::GpuTimer timer;
+    timer.Start(0);
+    algo(d_in1, d_in2, N);
+    timer.Stop(0);
+    VLOG(3) << "alog: " << j << " cost: " << timer.ElapsedTime() << "ms";
+  }
+
+#ifdef __HIPCC__
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+#else
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+#endif
+  free(in1);
+  free(in2);
+  free(out);
+#ifdef __HIPCC__
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+#endif
+}
diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc
new file mode 100644
index 0000000000000..a0de7842b9e0d
--- /dev/null
+++ b/paddle/phi/kernels/batch_norm_kernel.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BatchNormInferKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& scale,
+                          const DenseTensor& bias,
+                          const DenseTensor& mean,
+                          const DenseTensor& variance,
+                          float momentum,
+                          float epsilon,
+                          const std::string& data_layout,
+                          DenseTensor* y,
+                          DenseTensor* mean_out,
+                          DenseTensor* variance_out) {
+  // Since saved_mean and saved_variance are used regardless of whether
+  // they are in test mode, temporary variables need to be created here
+  // to be compatible
+  auto saved_mean = phi::EmptyLike<T, Context>(dev_ctx, *mean_out);
+  auto saved_variance = phi::EmptyLike<T, Context>(dev_ctx, *variance_out);
+  BatchNormKernel<T, Context>(dev_ctx,
+                              x,
+                              scale,
+                              bias,
+                              mean,
+                              variance,
+                              momentum,
+                              epsilon,
+                              data_layout,
+                              /*is_test=*/true,
+                              /*use_global_stats=*/false,
+                              /*trainable_statistics=*/false,
+                              /*fuse_with_relu=*/false,
+                              y,
+                              mean_out,
+                              variance_out,
+                              &saved_mean,
+                              &saved_variance,
+                              /*reserve_space=*/nullptr);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(batch_norm_infer,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormInferKernel,
+                   float,
+                   double) {}
+#ifdef PADDLE_WITH_CUDA
+PD_REGISTER_KERNEL(batch_norm_infer,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormInferKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
+#endif
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(batch_norm_infer,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::BatchNormInferKernel,
+                   float,
+                   phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h
index 7ddf32e27c7d7..be589e43647c1 100644
--- a/paddle/phi/kernels/batch_norm_kernel.h
+++ b/paddle/phi/kernels/batch_norm_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <string>
+
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
@@ -40,4 +41,18 @@ void BatchNormKernel(const Context& dev_ctx,
                      DenseTensor* saved_variance,
                      DenseTensor* reserve_space);
 
+template <typename T, typename Context>
+void BatchNormInferKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& scale,
+                          const DenseTensor& bias,
+                          const DenseTensor& mean,
+                          const DenseTensor& variance,
+                          float momentum,
+                          float epsilon,
+                          const std::string& data_layout,
+                          DenseTensor* y,
+                          DenseTensor* mean_out,
+                          DenseTensor* variance_out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/conv_kernel.cc b/paddle/phi/kernels/conv_kernel.cc
new file mode 100644
index 0000000000000..7268384f401a1
--- /dev/null
+++ b/paddle/phi/kernels/conv_kernel.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvInferKernel(const Context& dev_ctx,
+                     const DenseTensor& input,
+                     const DenseTensor& filter,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const std::string& paddding_algorithm,
+                     int groups,
+                     const std::vector<int>& dilations,
+                     const std::string& data_format,
+                     DenseTensor* out) {
+  ConvKernel<T, Context>(dev_ctx,
+                         input,
+                         filter,
+                         strides,
+                         paddings,
+                         paddding_algorithm,
+                         groups,
+                         dilations,
+                         data_format,
+                         /*use_addto=*/false,
+                         /*workspace_size_MB=*/paddle::platform::
+                             GetDefaultConvWorkspaceSizeLimitMB(),
+                         /*exhaustive_search=*/false,
+                         out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    conv2d_infer, CPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(
+    conv2d_infer, GPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {}
+#endif
diff --git a/paddle/phi/kernels/conv_kernel.h b/paddle/phi/kernels/conv_kernel.h
index eb0bfdd0275b5..508b3a42a21ad 100644
--- a/paddle/phi/kernels/conv_kernel.h
+++ b/paddle/phi/kernels/conv_kernel.h
@@ -64,4 +64,16 @@ void DepthwiseConvKernel(const Context& dev_ctx,
                          bool fuse_relu,
                          DenseTensor* out);
 
+template <typename T, typename Context>
+void ConvInferKernel(const Context& dev_ctx,
+                     const DenseTensor& input,
+                     const DenseTensor& filter,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const std::string& paddding_algorithm,
+                     int groups,
+                     const std::vector<int>& dilations,
+                     const std::string& data_format,
+                     DenseTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/conv_transpose_grad_kernel.h b/paddle/phi/kernels/conv_transpose_grad_kernel.h
new file mode 100644
index 0000000000000..2b1c0c1a934cf
--- /dev/null
+++ b/paddle/phi/kernels/conv_transpose_grad_kernel.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv2dTransposeGradKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& filter,
+                               const DenseTensor& dout,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& output_padding,
+                               const std::vector<int>& output_size,
+                               const std::string& padding_algorithm,
+                               int groups,
+                               const std::vector<int>& dilations,
+                               const std::string& data_format,
+                               DenseTensor* dx,
+                               DenseTensor* dfilter);
+
+template <typename T, typename Context>
+void Conv2dTransposeDoubleGradKernel(const Context& ctx,
+                                     const DenseTensor& x,
+                                     const DenseTensor& filter,
+                                     const DenseTensor& dout,
+                                     const DenseTensor& ddx,
+                                     const DenseTensor& ddfilter,
+                                     const std::vector<int>& strides,
+                                     const std::vector<int>& paddings,
+                                     const std::vector<int>& output_padding,
+                                     const std::vector<int>& output_size,
+                                     const std::string& padding_algorithm,
+                                     int groups,
+                                     const std::vector<int>& dilations,
+                                     const std::string& data_format,
+                                     DenseTensor* dx,
+                                     DenseTensor* dfilter,
+                                     DenseTensor* ddout);
+
+template <typename T, typename Context>
+void Conv3dTransposeGradKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& filter,
+                               const DenseTensor& dout,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& output_padding,
+                               const std::vector<int>& output_size,
+                               const std::string& padding_algorithm,
+                               int groups,
+                               const std::vector<int>& dilations,
+                               const std::string& data_format,
+                               DenseTensor* dx,
+                               DenseTensor* dfilter);
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
+                                        const DenseTensor& x,
+                                        const DenseTensor& filter,
+                                        const DenseTensor& dout,
+                                        const std::vector<int>& strides,
+                                        const std::vector<int>& paddings,
+                                        const std::vector<int>& output_padding,
+                                        const std::vector<int>& output_size,
+                                        const std::string& padding_algorithm,
+                                        int groups,
+                                        const std::vector<int>& dilations,
+                                        const std::string& data_format,
+                                        DenseTensor* dx,
+                                        DenseTensor* dfilter);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/conv_transpose_kernel.h b/paddle/phi/kernels/conv_transpose_kernel.h
new file mode 100644
index 0000000000000..de56f13ddf73e
--- /dev/null
+++ b/paddle/phi/kernels/conv_transpose_kernel.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv2dTransposeKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const std::vector<int>& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out);
+
+template <typename T, typename Context>
+void Conv3dTransposeKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const std::vector<int>& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out);
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeKernel(const Context& ctx,
+                                    const DenseTensor& x,
+                                    const DenseTensor& filter,
+                                    const std::vector<int>& strides,
+                                    const std::vector<int>& paddings,
+                                    const std::vector<int>& output_padding,
+                                    const std::vector<int>& output_size,
+                                    const std::string& padding_algorithm,
+                                    int groups,
+                                    const std::vector<int>& dilations,
+                                    const std::string& data_format,
+                                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index fe43ebb816077..0776e570e9cd3 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -21,71 +21,224 @@ limitations under the License. */
 
 namespace phi {
 
-#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \
+#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
   template <typename T, typename Context>                           \
   void name##GradKernel(const Context& dev_ctx,                     \
                         const DenseTensor& x,                       \
                         const DenseTensor& dout,                    \
                         DenseTensor* dx) {                          \
-    functor_class functor;                                          \
-    ActivationGradImpl<T, Context, functor_class>(                  \
+    funcs::functor_class<T> functor;                                \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>(        \
         dev_ctx, &x, nullptr, &dout, dx, functor);                  \
   }
 
-#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(      \
+    name, functor_class, attr)                               \
+  template <typename T, typename Context>                    \
+  void name##GradKernel(const Context& dev_ctx,              \
+                        const DenseTensor& x,                \
+                        const DenseTensor& dout,             \
+                        float attr,                          \
+                        DenseTensor* dx) {                   \
+    funcs::functor_class<T> functor;                         \
+    auto attrs = functor.GetAttrs();                         \
+    *(attrs[0].second) = attr;                               \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);           \
+  }
+
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(      \
+    name, functor_class, attr1, attr2)                       \
+  template <typename T, typename Context>                    \
+  void name##GradKernel(const Context& dev_ctx,              \
+                        const DenseTensor& x,                \
+                        const DenseTensor& dout,             \
+                        float attr1,                         \
+                        float attr2,                         \
+                        DenseTensor* dx) {                   \
+    funcs::functor_class<T> functor;                         \
+    auto attrs = functor.GetAttrs();                         \
+    *(attrs[0].second) = attr1;                              \
+    *(attrs[1].second) = attr2;                              \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);           \
+  }
+
+#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
   template <typename T, typename Context>                             \
   void name##GradKernel(const Context& dev_ctx,                       \
                         const DenseTensor& out,                       \
                         const DenseTensor& dout,                      \
                         DenseTensor* dx) {                            \
-    functor_class functor;                                            \
-    ActivationGradImpl<T, Context, functor_class>(                    \
+    funcs::functor_class<T> functor;                                  \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>(          \
         dev_ctx, nullptr, &out, &dout, dx, functor);                  \
   }
 
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor<T>);
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor<T>);
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(    \
+    name, functor_class, attr)                               \
+  template <typename T, typename Context>                    \
+  void name##GradKernel(const Context& dev_ctx,              \
+                        const DenseTensor& out,              \
+                        const DenseTensor& dout,             \
+                        float attr,                          \
+                        DenseTensor* dx) {                   \
+    funcs::functor_class<T> functor;                         \
+    auto attrs = functor.GetAttrs();                         \
+    *(attrs[0].second) = attr;                               \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, nullptr, &out, &dout, dx, functor);         \
+  }
+
+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(    \
+    name, functor_class, attr1, attr2)                       \
+  template <typename T, typename Context>                    \
+  void name##GradKernel(const Context& dev_ctx,              \
+                        const DenseTensor& out,              \
+                        const DenseTensor& dout,             \
+                        float attr1,                         \
+                        float attr2,                         \
+                        DenseTensor* dx) {                   \
+    funcs::functor_class<T> functor;                         \
+    auto attrs = functor.GetAttrs();                         \
+    *(attrs[0].second) = attr1;                              \
+    *(attrs[1].second) = attr2;                              \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, nullptr, &out, &dout, dx, functor);         \
+  }
+
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
+
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor);
+
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
+                                               LeakyReluGradFunctor,
+                                               alpha);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
+                                               ThresholdedReluGradFunctor,
+                                               threshold);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
+                                               SoftShrinkGradFunctor,
+                                               lambda);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
+                                               HardShrinkGradFunctor,
+                                               threshold);
+
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
+                                               BReluGradFunctor,
+                                               t_min,
+                                               t_max);
+
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                 HardSigmoidGradFunctor,
+                                                 slope,
+                                                 offset);
+
+template <typename T, typename Context>
+void EluGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& out,
+                   const DenseTensor& dout,
+                   float alpha,
+                   DenseTensor* dx) {
+  dev_ctx.template Alloc<T>(dx);
+
+  auto x_flatten =
+      EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad"));
+  auto out_flatten = EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad"));
+  auto dout_flatten = EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad"));
+  auto dx_flatten =
+      EigenVector<T>::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad"));
+  auto* place = dev_ctx.eigen_device();
+
+  if (alpha > 0) {
+    funcs::ELUGradFunctor<T> functor;
+    functor.alpha = alpha;
+    functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
+  } else {
+    funcs::ELUGradNegativeAlphaFunctor<T> functor;
+    functor.alpha = alpha;
+    functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten);
+  }
+}
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {}
-PD_REGISTER_KERNEL(
-    atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {}
 PD_REGISTER_KERNEL(
     relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {}
-PD_REGISTER_KERNEL(relu_double_grad,
+
+#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
+
+#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(                                         \
+      name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
+                                   ThresholdedReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
+
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad,
+                                          ReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad,
+                                          TanhDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad,
+                                          LeakyReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+
+PD_REGISTER_KERNEL(tanh_triple_grad,
                    CPU,
                    ALL_LAYOUT,
-                   phi::ReluDoubleGradKernel,
+                   phi::TanhTripleGradKernel,
                    float,
                    double,
                    phi::dtype::float16) {}
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(log_double_grad, LogDoubleGradKernel)
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index 51883f25183af..c8709261d2cb0 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -19,37 +19,110 @@ limitations under the License. */
 
 namespace phi {
 
-#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class)                \
-  template <typename T, typename Context>                                \
-  void name##Kernel(                                                     \
-      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {  \
-    functor_class functor;                                               \
-    ActivationImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
+#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class)               \
+  template <typename T, typename Context>                               \
+  void name##Kernel(                                                    \
+      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
+    funcs::functor_class<T> functor;                                    \
+    ActivationImpl<T, Context, funcs::functor_class<T>>(                \
+        dev_ctx, x, out, functor);                                      \
   }
 
-DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor<T>)
-DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor<T>)
+#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
+  template <typename T, typename Context>                               \
+  void name##Kernel(const Context& dev_ctx,                             \
+                    const DenseTensor& x,                               \
+                    float attr,                                         \
+                    DenseTensor* out) {                                 \
+    funcs::functor_class<T> functor;                                    \
+    auto attrs = functor.GetAttrs();                                    \
+    *(attrs[0].second) = attr;                                          \
+    ActivationImpl<T, Context, funcs::functor_class<T>>(                \
+        dev_ctx, x, out, functor);                                      \
+  }
+
+#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(            \
+    name, functor_class, attr1, attr2)                   \
+  template <typename T, typename Context>                \
+  void name##Kernel(const Context& dev_ctx,              \
+                    const DenseTensor& x,                \
+                    float attr1,                         \
+                    float attr2,                         \
+                    DenseTensor* out) {                  \
+    funcs::functor_class<T> functor;                     \
+    auto attrs = functor.GetAttrs();                     \
+    *(attrs[0].second) = attr1;                          \
+    *(attrs[1].second) = attr2;                          \
+    ActivationImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, x, out, functor);                       \
+  }
+
+DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log2, Log2Functor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log10, Log10Functor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log1p, Log1pFunctor)
+
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
+                                     ThresholdedReluFunctor,
+                                     threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
+DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
+
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     HardSigmoidFunctor,
+                                     slope,
+                                     offset)
 
 }  // namespace phi
-PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {}
-PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {}
-PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {}
-PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {}
-PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {}
-PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {}
-PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {}
-PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {}
-PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {}
-PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {}
-PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {}
 PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
+
+#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {}
+
+PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
diff --git a/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc
new file mode 100644
index 0000000000000..8d0749500695c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_transpose_grad_kernel.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
+                                        const DenseTensor& x,
+                                        const DenseTensor& filter,
+                                        const DenseTensor& dout,
+                                        const std::vector<int>& strides,
+                                        const std::vector<int>& paddings,
+                                        const std::vector<int>& output_padding,
+                                        const std::vector<int>& output_size,
+                                        const std::string& padding_algorithm,
+                                        int groups,
+                                        const std::vector<int>& dilations,
+                                        const std::string& data_format,
+                                        DenseTensor* dx,
+                                        DenseTensor* dfilter) {
+  ConvTransposeGradRawKernel<T, Context>(ctx,
+                                         x,
+                                         filter,
+                                         dout,
+                                         strides,
+                                         paddings,
+                                         padding_algorithm,
+                                         groups,
+                                         dilations,
+                                         data_format,
+                                         dx,
+                                         dfilter);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(conv3d_transpose_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConv2dTransposeGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/conv_transpose_kernel.cc b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc
new file mode 100644
index 0000000000000..b4cacc850938e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/conv_transpose_kernel.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeKernel(const Context& ctx,
+                                    const DenseTensor& x,
+                                    const DenseTensor& filter,
+                                    const std::vector<int>& strides,
+                                    const std::vector<int>& paddings,
+                                    const std::vector<int>& output_padding,
+                                    const std::vector<int>& output_size,
+                                    const std::string& padding_algorithm,
+                                    int groups,
+                                    const std::vector<int>& dilations,
+                                    const std::string& data_format,
+                                    DenseTensor* out) {
+  ConvTransposeRawKernel<T, Context>(ctx,
+                                     x,
+                                     filter,
+                                     strides,
+                                     paddings,
+                                     padding_algorithm,
+                                     groups,
+                                     dilations,
+                                     data_format,
+                                     out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(conv3d_transpose,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(depthwise_conv2d_transpose,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConv2dTransposeKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc
index 1af071f23ddc5..fa11fd05bf1d6 100644
--- a/paddle/phi/kernels/cpu/copy_kernel.cc
+++ b/paddle/phi/kernels/cpu/copy_kernel.cc
@@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx,
           << src_place;
 
   dst->Resize(src.dims());
-  auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+  auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
 
   if (src_ptr == dst_ptr) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc
new file mode 100644
index 0000000000000..a25f9650fc50f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+// NOTE(@xiongkun): use of IsComplex<>
+#include "paddle/fluid/framework/data_type.h"
+
+namespace phi {
+template <typename T, typename Context>
+void CumprodGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& out,
+                       const DenseTensor& d_out,
+                       int dim,
+                       DenseTensor* d_x) {
+  DDim shape = x.dims();
+
+  auto* d_out_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* out_data = out.data<T>();
+  auto* d_x_data = dev_ctx.template Alloc<T>(d_x);
+
+  size_t outer_dim = 1;
+  size_t mid_dim = 1;
+  size_t inner_dim = 1;
+  GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
+  size_t numel = outer_dim * mid_dim * inner_dim;
+
+  // deal with complex
+  const T* x_data_deal;
+  const T* out_data_deal;
+  Allocator::AllocationPtr x_conj;
+  Allocator::AllocationPtr out_conj;
+  if (paddle::framework::IsComplex<T>::value) {
+    x_conj = const_cast<Allocator&>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto* x_data_conj = reinterpret_cast<T*>(x_conj->ptr());
+    out_conj = const_cast<Allocator&>(dev_ctx.GetAllocator())
+                   .Allocate(numel * sizeof(T));
+    auto* out_data_conj = reinterpret_cast<T*>(out_conj->ptr());
+
+    phi::funcs::ForRange<Context> for_range_x(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
+    for_range_x(functor_x);
+
+    phi::funcs::ForRange<Context> for_range_out(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_out(out_data, numel, out_data_conj);
+    for_range_out(functor_out);
+
+    x_data_deal = x_data_conj;
+    out_data_deal = out_data_conj;
+  } else {
+    x_data_deal = x_data;
+    out_data_deal = out_data;
+  }
+
+  for (size_t i = 0; i < outer_dim; i++) {
+    for (size_t k = 0; k < inner_dim; k++) {
+      for (size_t j = 0; j < mid_dim; j++) {
+        size_t index = i * mid_dim * inner_dim + j * inner_dim + k;
+        d_x_data[index] = 0;
+        for (size_t n = 0; n < mid_dim; n++) {
+          size_t pos = i * mid_dim * inner_dim + n * inner_dim + k;
+          T elem;
+          if (j == 0) {
+            elem = d_out_data[pos];
+          } else {
+            elem = d_out_data[pos] * out_data_deal[index - inner_dim];
+          }
+          if (pos > index) {
+            for (size_t m = index + inner_dim; m <= pos; m += inner_dim) {
+              elem *= x_data_deal[m];
+            }
+          } else if (pos < index) {
+            elem = static_cast<T>(0);
+          }
+          d_x_data[index] += elem;
+        }
+      }
+    }
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(cumprod_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CumprodGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc
new file mode 100644
index 0000000000000..aea338027f5bb
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_kernel.h"
+
+#include <cstdint>
+#include <type_traits>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+
+namespace phi {
+template <typename T, typename Context>
+void CumprodKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   int dim,
+                   DenseTensor* out) {
+  const DenseTensor* x = &input;
+  auto* x_data = x->data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  DDim shape = x->dims();
+
+  size_t outer_dim = 1;
+  size_t mid_dim = 1;
+  size_t inner_dim = 1;
+  GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim);
+
+  for (size_t i = 0; i < outer_dim; i++) {
+    for (size_t j = 0; j < mid_dim; j++) {
+      for (size_t k = 0; k < inner_dim; k++) {
+        size_t pos = i * mid_dim * inner_dim + j * inner_dim + k;
+        if (j == 0) {
+          out_data[pos] = x_data[pos];
+        } else {
+          out_data[pos] = out_data[pos - inner_dim] * x_data[pos];
+        }
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CumprodKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/deformable_conv_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc
new file mode 100644
index 0000000000000..0d61f7be68af9
--- /dev/null
+++ b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/deformable_conv_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
+
+namespace phi {
+
+template <typename T>
+inline void ModulatedDeformableIm2colCPUKernel(
+    const int num_kernels,
+    const T* data_im,
+    const T* data_offset,
+    const T* data_mask,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    T* data_col) {
+  for (int i = 0; i < num_kernels; i++) {
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    T* data_col_ptr =
+        data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    const T* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const T* data_offset_ptr =
+        data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const T* data_mask_ptr =
+        data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        const T mask = data_mask_ptr[data_mask_hw_ptr];
+        T val = static_cast<T>(0);
+        const T h_im = h_in + i * dilation_h + offset_h;
+        const T w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          val =
+              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ModulatedDeformableIm2col(const Context& dev_ctx,
+                               const T* data_im,
+                               const T* data_offset,
+                               const T* data_mask,
+                               const std::vector<int64_t>& im_shape,
+                               const std::vector<int64_t>& col_shape,
+                               const std::vector<int64_t>& filter_shape,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& dilations,
+                               const int deformable_groups,
+                               T* data_col) {
+  int channel_per_deformable_group = im_shape[0] / deformable_groups;
+  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+
+  // get outputs of im2col with offset by bilinear interpolation
+  ModulatedDeformableIm2colCPUKernel(num_kernels,
+                                     data_im,
+                                     data_offset,
+                                     data_mask,
+                                     im_shape[1],
+                                     im_shape[2],
+                                     filter_shape[2],
+                                     filter_shape[3],
+                                     paddings[0],
+                                     paddings[1],
+                                     strides[0],
+                                     strides[1],
+                                     dilations[0],
+                                     dilations[1],
+                                     channel_per_deformable_group,
+                                     col_shape[1],
+                                     im_shape[0],
+                                     deformable_groups,
+                                     col_shape[2],
+                                     col_shape[3],
+                                     data_col);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(deformable_conv,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DeformableConvKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc
new file mode 100644
index 0000000000000..e57d7263f88bf
--- /dev/null
+++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/determinant_grad_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(determinant_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DeterminantGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/determinant_kernel.cc b/paddle/phi/kernels/cpu/determinant_kernel.cc
new file mode 100644
index 0000000000000..5810e88e92527
--- /dev/null
+++ b/paddle/phi/kernels/cpu/determinant_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/determinant_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/determinant_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc
index 37ad18df56ec3..095d11720ce26 100644
--- a/paddle/phi/kernels/cpu/elementwise_kernel.cc
+++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc
@@ -12,10 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/phi/kernels/cpu/elementwise.h"
+#include "paddle/phi/api/ext/dispatch.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
+namespace phi {
+
+#define DEFINE_CPU_ELEMENTWISE_OP(name)                                     \
+  template <typename T, typename Context>                                   \
+  void name##RawKernel(const Context& dev_ctx,                              \
+                       const DenseTensor& x,                                \
+                       const DenseTensor& y,                                \
+                       int axis,                                            \
+                       DenseTensor* out) {                                  \
+    dev_ctx.template Alloc<T>(out);                                         \
+    if (x.dims() == y.dims()) {                                             \
+      SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
+          dev_ctx, x, y, out);                                              \
+    } else {                                                                \
+      auto x_dims = x.dims();                                               \
+      auto y_dims = y.dims();                                               \
+      if (x_dims.size() >= y_dims.size()) {                                 \
+        funcs::ElementwiseCompute<funcs::name##Functor<T>, T>(              \
+            dev_ctx, x, y, axis, funcs::name##Functor<T>(), out);           \
+      } else {                                                              \
+        funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>(     \
+            dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out);  \
+      }                                                                     \
+    }                                                                       \
+  }
+
+template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis,
+                     DenseTensor* out) {
+  // allocate memory for out
+  dev_ctx.template Alloc<T>(out);
+  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
+    SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
+        dev_ctx, x, y, out);
+  } else {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    if (x_dims.size() >= y_dims.size()) {
+      funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
+    } else {
+      funcs::ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
+          dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
+    }
+  }
+}
+
+// Create the definition of Add
+DEFINE_CPU_ELEMENTWISE_OP(Add)
+
+// Create the definition of Subtract
+DEFINE_CPU_ELEMENTWISE_OP(Subtract)
+
+// Create the definition of Multiply
+DEFINE_CPU_ELEMENTWISE_OP(Multiply)
+
+}  // namespace phi
+
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
+// using bfloat16 = ::phi::dtype::bfloat16;
+
 PD_REGISTER_KERNEL(elementwise_fmax,
                    CPU,
                    ALL_LAYOUT,
@@ -33,3 +104,49 @@ PD_REGISTER_KERNEL(elementwise_fmin,
                    double,
                    int,
                    int64_t) {}
+
+PD_REGISTER_KERNEL(add_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::AddRawKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(subtract_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SubtractRawKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(divide_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::DivideRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(multiply_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   complex64,
+                   complex128,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
new file mode 100644
index 0000000000000..21b3e6da8d9ef
--- /dev/null
+++ b/paddle/phi/kernels/cpu/embedding_grad_kernel.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/embedding_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+struct EmbeddingGradCPUFunctor {
+  EmbeddingGradCPUFunctor(const Context& dev_ctx,
+                          const DenseTensor& input,
+                          const DenseTensor& weight,
+                          const DenseTensor& out_grad,
+                          int64_t padding_idx,
+                          DenseTensor* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        weight_grad_(weight_grad),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    DDim table_dim = weight_.dims();
+
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_num = static_cast<int64_t>(ids.size());
+
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    {
+      auto* d_output = &out_grad_;
+      auto* ids_data = ids.data();
+
+      int64_t N = table_dim[0];
+      int64_t D = table_dim[1];
+
+      auto* d_output_data = d_output->template data<T>();
+
+      dev_ctx_.template Alloc<T>(weight_grad_);
+      auto* d_table_data = weight_grad_->data<T>();
+
+      memset(d_table_data, 0, weight_grad_->numel() * sizeof(T));
+
+      for (int64_t i = 0; i < ids_num; ++i) {
+        if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) {
+          // the gradient of padding_idx should be 0, already done by memset, so
+          // do nothing.
+        } else {
+          PADDLE_ENFORCE_LT(
+              ids_data[i],
+              N,
+              phi::errors::InvalidArgument(
+                  "Variable value (input) of "
+                  "OP(paddle.nn.functional.embedding) "
+                  "expected >= 0 and < %ld, but got %ld. Please check input "
+                  "value.",
+                  N,
+                  ids_data[i]));
+          PADDLE_ENFORCE_GE(
+              ids_data[i],
+              0,
+              phi::errors::InvalidArgument(
+                  "Variable value (input) of "
+                  "OP(paddle.nn.functional.embedding) "
+                  "expected >= 0 and < %ld, but got %ld. Please check input "
+                  "value.",
+                  N,
+                  ids_data[i]));
+          for (int j = 0; j < D; ++j) {
+            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const DenseTensor& weight_;
+  const DenseTensor& out_grad_;
+  DenseTensor* weight_grad_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void EmbeddingGradKernel(const Context& ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& weight,
+                         const DenseTensor& out_grad,
+                         int64_t padding_idx,
+                         DenseTensor* weight_grad) {
+  EmbeddingGradCPUFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+template <typename T, typename Context>
+struct EmbeddingSparseGradCPUFunctor {
+  EmbeddingSparseGradCPUFunctor(const Context& dev_ctx,
+                                const DenseTensor& input,
+                                const DenseTensor& weight,
+                                const DenseTensor& out_grad,
+                                int64_t padding_idx,
+                                SelectedRows* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        weight_grad_(weight_grad),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    DDim table_dim = weight_.dims();
+
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_num = static_cast<int64_t>(ids.size());
+
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    auto* d_table = weight_grad_;
+    auto* d_output = &out_grad_;
+    d_table->set_rows(ids);
+
+    auto* d_table_value = d_table->mutable_value();
+    d_table_value->Resize({ids_num, table_dim[1]});
+
+    dev_ctx_.template Alloc<T>(d_table_value);
+
+    d_table->set_height(table_dim[0]);
+
+    auto* d_output_data = d_output->template data<T>();
+    auto* d_table_data = d_table_value->template data<T>();
+
+    auto d_output_dims = d_output->dims();
+    auto d_output_dims_2d =
+        flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
+    PADDLE_ENFORCE_EQ(d_table_value->dims(),
+                      d_output_dims_2d,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: The shape of lookup_table@Grad and "
+                          "output@Grad should be same. "
+                          "But received lookup_table@Grad's shape = [%s], "
+                          "output@Grad's shape = [%s].",
+                          d_table_value->dims(),
+                          d_output_dims_2d));
+    memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const DenseTensor& weight_;
+  const DenseTensor& out_grad_;
+  SelectedRows* weight_grad_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void EmbeddingSparseGradKernel(const Context& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& weight,
+                               const DenseTensor& out_grad,
+                               int64_t padding_idx,
+                               SelectedRows* weight_grad) {
+  EmbeddingSparseGradCPUFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(embedding_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(embedding_sparse_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingSparseGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc
new file mode 100644
index 0000000000000..76cc3814b0567
--- /dev/null
+++ b/paddle/phi/kernels/cpu/embedding_kernel.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/embedding_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+struct EmbeddingCPUFunctor {
+  EmbeddingCPUFunctor(const Context& dev_ctx,
+                      const DenseTensor& input,
+                      const DenseTensor& weight,
+                      int64_t padding_idx,
+                      DenseTensor* out)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_(out),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_numel = static_cast<int64_t>(ids.size());
+
+    int64_t row_number = weight_.dims()[0];
+    int64_t row_width = weight_.dims()[1];
+
+    auto* table = weight_.data<T>();
+
+    dev_ctx_.template Alloc<T>(out_);
+    auto* output = out_->data<T>();
+
+    for (int64_t i = 0; i < ids_numel; ++i) {
+      if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) {
+        memset(output + i * row_width, 0, row_width * sizeof(T));
+      } else {
+        PADDLE_ENFORCE_LT(
+            ids[i],
+            row_number,
+            phi::errors::InvalidArgument(
+                "Variable value (input) of OP(fluid.layers.embedding) "
+                "expected >= 0 and < %ld, but got %ld. Please check input "
+                "value.",
+                row_number,
+                ids[i]));
+        PADDLE_ENFORCE_GE(
+            ids[i],
+            0,
+            phi::errors::InvalidArgument(
+                "Variable value (input) of OP(fluid.layers.embedding) "
+                "expected >= 0 and < %ld, but got %ld. Please check input "
+                "value.",
+                row_number,
+                ids[i]));
+        memcpy(output + i * row_width,
+               table + ids[i] * row_width,
+               row_width * sizeof(T));
+      }
+    }
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const DenseTensor& weight_;
+  DenseTensor* out_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void EmbeddingKernel(const Context& ctx,
+                     const DenseTensor& input,
+                     const DenseTensor& weight,
+                     int64_t padding_idx,
+                     DenseTensor* out) {
+  EmbeddingCPUFunctor<T, Context> functor(ctx, input, weight, padding_idx, out);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(embedding,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc
new file mode 100644
index 0000000000000..338be9e252da3
--- /dev/null
+++ b/paddle/phi/kernels/cpu/frobenius_norm_grad_kernel.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(frobenius_norm_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FrobeniusNormGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc
new file mode 100644
index 0000000000000..77509b953bf39
--- /dev/null
+++ b/paddle/phi/kernels/cpu/frobenius_norm_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/frobenius_norm_kernel.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    frobenius_norm, CPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc
new file mode 100644
index 0000000000000..f0a6948018afc
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_grad_kernel.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& index,
+                      const DenseTensor& out_grad,
+                      const Scalar& axis,
+                      bool overwrite,
+                      DenseTensor* x_grad) {
+  const auto& index_type = index.dtype();
+  auto axis_v = axis.to<int>();
+
+  if (axis_v != 0) {
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GatherV2GradFunction<T, int32_t>(
+          dev_ctx, &out_grad, &index, axis_v, x_grad);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::GatherV2GradFunction<T, int64_t>(
+          dev_ctx, &out_grad, &index, axis_v, x_grad);
+    }
+    return;
+  }
+
+  dev_ctx.template Alloc<T>(x_grad);
+
+  auto dxt = EigenVector<T>::Flatten(*x_grad);
+  auto& place = *dev_ctx.eigen_device();
+  dxt.device(place) = dxt.constant(static_cast<T>(0));
+  if (x_grad->numel() == 0) return;
+
+  if (index_type == phi::DataType::INT32) {
+    if (overwrite) {
+      phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, out_grad, index, x_grad);
+    } else {
+      phi::funcs::ScatterAssignAdd<T, int32_t>(
+          dev_ctx, out_grad, index, x_grad);
+    }
+  } else if (index_type == phi::DataType::INT64) {
+    if (overwrite) {
+      phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, out_grad, index, x_grad);
+    } else {
+      phi::funcs::ScatterAssignAdd<T, int64_t>(
+          dev_ctx, out_grad, index, x_grad);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GatherGradKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc
new file mode 100644
index 0000000000000..9207a05b9dcce
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gather_kernel.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_kernel.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& index,
+                  const Scalar& axis,
+                  DenseTensor* out) {
+  const auto& index_type = index.dtype();
+  auto axis_v = axis.to<int>();
+  if (axis_v != 0) {
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GatherV2Function<T, int32_t>(
+          dev_ctx, &x, &index, axis_v, out);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::GatherV2Function<T, int64_t>(
+          dev_ctx, &x, &index, axis_v, out);
+    }
+    return;
+  }
+
+  dev_ctx.template Alloc<T>(out);
+
+  if (x.numel() == 0) {
+    return;
+  }
+
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::CPUGather<T, int>(dev_ctx, x, index, out);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::CPUGather<T, int64_t>(dev_ctx, x, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GatherKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc
new file mode 100644
index 0000000000000..254c4ea5716d1
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gelu_grad_kernel.h"
+
+#include <algorithm>
+#include <cmath>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/blas/blas_impl.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/gelu_kernel.h"
+
+namespace phi {
+
+template <typename T>
+struct GeluGradFunctor {
+  template <typename Device, typename X, typename dOut, typename dX>
+  void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const {
+    if (approximate) {
+      if (std::is_same<T, dtype::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto casted_dout = dout.template cast<float>();
+
+        const float kAlpha = static_cast<float>(M_2_SQRTPI * M_SQRT1_2);
+        const float kBeta =
+            kAlpha * static_cast<float>(GELU_CONSTANT) * static_cast<float>(3);
+        const auto y =
+            (kAlpha *
+             ((static_cast<float>(GELU_CONSTANT) * casted_x.cube()) + casted_x))
+                .tanh();
+        dx.device(d) = (static_cast<float>(0.5) * casted_dout *
+                        (static_cast<float>(1) + y +
+                         (casted_x - casted_x * y.square()) *
+                             (kAlpha + kBeta * casted_x.square())))
+                           .template cast<T>();
+      } else {
+        const T kAlpha = static_cast<T>(M_2_SQRTPI * M_SQRT1_2);
+        const T kBeta =
+            kAlpha * static_cast<T>(GELU_CONSTANT) * static_cast<T>(3);
+        const auto y =
+            (kAlpha * ((static_cast<T>(GELU_CONSTANT) * x.cube()) + x)).tanh();
+        dx.device(d) = static_cast<T>(0.5) * dout *
+                       (static_cast<T>(1) + y +
+                        (x - x * y.square()) * (kAlpha + kBeta * x.square()));
+      }
+    } else {
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
+    !defined(PADDLE_WITH_HIP)
+      auto x_data = x.data();
+      auto dx_data = dx.data();
+      auto dout_data = dout.data();
+      int n = std::min(x.size(), dx.size());
+
+      auto first = static_cast<T*>(std::malloc(n * sizeof(T)));
+      std::memset(first, 0, n * sizeof(T));
+      auto second = static_cast<T*>(std::malloc(n * sizeof(T)));
+      std::memset(second, 0, n * sizeof(T));
+
+      // first = (0.5 * (1 + erf(x / sqrt(2))))
+      phi::funcs::CBlas<T>::AXPY(
+          n, static_cast<T>(M_SQRT1_2), x_data, 1, first, 1);
+      phi::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
+      for (int i = 0; i < n; i++) {
+        first[i] += static_cast<T>(1);
+      }
+      phi::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
+
+      // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
+      phi::funcs::CBlas<T>::VSQUARE(n, x_data, second);
+      phi::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
+      phi::funcs::CBlas<T>::VEXP(n, second, second);
+      phi::funcs::CBlas<T>::VMUL(n, x_data, second, second);
+      phi::funcs::CBlas<T>::SCAL(
+          n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
+
+      // dx = dout * (first + second);
+      phi::funcs::CBlas<T>::VADD(n, first, second, first);
+      phi::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
+
+      std::free(first);
+      std::free(second);
+#else
+      // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) *
+      // exp(- x^2 / 2)
+      if (std::is_same<T, dtype::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto casted_dout = dout.template cast<float>();
+        auto first = static_cast<float>(0.5) *
+                     (static_cast<float>(1) +
+                      ((casted_x * static_cast<float>(M_SQRT1_2)).erf()));
+        auto second = static_cast<float>(0.5 * M_2_SQRTPI * M_SQRT1_2) *
+                      casted_x *
+                      (-static_cast<float>(0.5) * casted_x.square()).exp();
+        dx.device(d) = (casted_dout * (first + second)).template cast<T>();
+      } else {
+        auto first =
+            static_cast<T>(0.5) *
+            (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
+
+        auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
+                      (-static_cast<T>(0.5) * x.square()).exp();
+        dx.device(d) = dout * (first + second);
+      }
+#endif
+    }
+  }
+};
+
+template <typename T, typename Context>
+void GeluGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    bool approximate,
+                    DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  auto eigen_x = EigenVector<T>::Flatten(x);
+  auto eigen_out_grad = EigenVector<T>::Flatten(out_grad);
+  auto eigen_x_grad = EigenVector<T>::Flatten(*x_grad);
+  auto& dev = *dev_ctx.eigen_device();
+
+  GeluGradFunctor<T> functor;
+  functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc
new file mode 100644
index 0000000000000..d7af220574565
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gelu_kernel.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gelu_kernel.h"
+#include <algorithm>
+#include <cmath>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/blas/blas_impl.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T>
+struct GeluFunctor {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out, bool approximate) const {
+    if (approximate) {
+      // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3})))
+      if (std::is_same<T, dtype::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto temp =
+            (static_cast<float>(M_2_SQRTPI * M_SQRT1_2) *
+             (casted_x + static_cast<float>(GELU_CONSTANT) * casted_x.cube()))
+                .tanh();
+        out.device(d) = (casted_x * static_cast<float>(0.5) *
+                         (static_cast<float>(1) + temp))
+                            .template cast<T>();
+      } else {
+        auto temp = (static_cast<T>(M_2_SQRTPI * M_SQRT1_2) *
+                     (x + static_cast<T>(GELU_CONSTANT) * x.cube()))
+                        .tanh();
+        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+      }
+    } else {
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) &&                       \
+    !defined(PADDLE_WITH_HIP)
+      auto x_data = x.data();
+      auto out_data = out.data();
+      int n = std::min(x.size(), out.size());
+
+      std::memset(out_data, 0, n * sizeof(T));
+      phi::funcs::CBlas<T>::AXPY(
+          n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
+      phi::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
+      for (int i = 0; i < n; i++) {
+        out_data[i] += static_cast<T>(1);
+      }
+      phi::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
+      for (int i = 0; i < n; i++) {
+        out_data[i] *= static_cast<T>(0.5);
+      }
+#else
+      // gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
+      if (std::is_same<T, dtype::float16>::value) {
+        VLOG(4) << "cast from float16 to float before computing";
+        auto casted_x = x.template cast<float>();
+        auto temp = (casted_x * static_cast<float>(M_SQRT1_2)).erf();
+        out.device(d) = (casted_x * static_cast<float>(0.5) *
+                         (static_cast<float>(1) + temp))
+                            .template cast<T>();
+      } else {
+        auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
+        out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
+      }
+#endif
+    }
+  }
+};
+
+template <typename T, typename Context>
+void GeluKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                bool approximate,
+                DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  auto eigen_out = EigenVector<T>::Flatten(*out);
+  auto eigen_x = EigenVector<T>::Flatten(x);
+  auto& dev = *dev_ctx.eigen_device();
+
+  GeluFunctor<T> functor;
+  functor(dev, eigen_x, eigen_out, approximate);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
index 8538461b1b83b..6a83cee1ae40d 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
@@ -23,15 +23,14 @@
 namespace phi {
 
 template <typename T, typename IndexT, typename Functor>
-void GraphSendRecvCpuGradLoop(const int& input_size,
-                              const int& index_size,
+void GraphSendRecvCpuGradLoop(const int& index_size,
                               const IndexT* s_index,
                               const IndexT* d_index,
                               const DenseTensor& src,
+                              const DenseTensor& input,
                               DenseTensor* dst,
                               const std::string& pool_type,
                               const int* dst_count = nullptr,
-                              const DenseTensor* input = nullptr,
                               const DenseTensor* output = nullptr) {
   if (pool_type == "SUM") {
     Functor functor;
@@ -55,7 +54,7 @@ void GraphSendRecvCpuGradLoop(const int& input_size,
     for (int i = 0; i < index_size; ++i) {
       const IndexT& forward_src_idx = d_index[i];
       const IndexT& forward_dst_idx = s_index[i];
-      auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
+      auto input_slice = input.Slice(forward_src_idx, forward_src_idx + 1);
       auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
       auto eigen_input = phi::EigenVector<T>::Flatten(input_slice);
       auto eigen_output = phi::EigenVector<T>::Flatten(output_slice);
@@ -73,18 +72,18 @@ template <typename Context, typename T, typename IndexT>
 void GraphSendRecvGradOpKernelLaunchHelper(
     const Context& ctx,
     const DenseTensor& out_grad,
+    const DenseTensor& x,
     const DenseTensor& src_index,
     const DenseTensor& dst_index,
     const std::string& pool_type,
     DenseTensor* x_grad,
     const DenseTensor* dst_count = nullptr,
-    const DenseTensor* x = nullptr,
     const DenseTensor* out = nullptr) {
   const int& index_size = dst_index.dims()[0];
 
   ctx.template Alloc<T>(x_grad);
   T* p_output = x_grad->data<T>();
-  const auto& src_dims = out_grad.dims();
+  const auto& src_dims = x.dims();
   int64_t memset_size = 1;
   for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
   const size_t& memset_bytes = memset_size * sizeof(T);
@@ -97,29 +96,22 @@ void GraphSendRecvGradOpKernelLaunchHelper(
 
   if (pool_type == "SUM") {
     GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type);
+        index_size, d_index, s_index, out_grad, x, x_grad, pool_type);
   } else if (pool_type == "MEAN") {
     const int* s_count = dst_count->data<int>();
     // Functor not used here.
-    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
-                                                                    index_size,
-                                                                    d_index,
-                                                                    s_index,
-                                                                    out_grad,
-                                                                    x_grad,
-                                                                    pool_type,
-                                                                    s_count);
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
+        index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count);
   } else if (pool_type == "MIN" || pool_type == "MAX") {
     // Functor not used here.
-    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(src_dims[0],
-                                                                    index_size,
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(index_size,
                                                                     d_index,
                                                                     s_index,
                                                                     out_grad,
+                                                                    x,
                                                                     x_grad,
                                                                     pool_type,
                                                                     nullptr,
-                                                                    x,
                                                                     out);
   }
 }
@@ -127,7 +119,7 @@ void GraphSendRecvGradOpKernelLaunchHelper(
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
@@ -139,23 +131,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
     GraphSendRecvGradOpKernelLaunchHelper<Context, T, int32_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvGradOpKernelLaunchHelper<Context, T, int64_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   }
 }
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
index fecbd4b1d7aa0..8f71ba12cc4fa 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
@@ -83,6 +83,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
                                        const DenseTensor& src_index,
                                        const DenseTensor& dst_index,
                                        const std::string& pool_type,
+                                       int64_t out_size,
                                        DenseTensor* out,
                                        DenseTensor* dst_count = nullptr) {
   const int& index_size = src_index.dims()[0];
@@ -91,7 +92,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
   T* p_output = out->data<T>();
   const auto& src_dims = x.dims();
   int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
+  if (out_size <= 0) {
+    for (int i = 0; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  } else {
+    memset_size = out_size;
+    for (int i = 1; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  }
   const size_t& memset_bytes = memset_size * sizeof(T);
   memset(p_output, 0, memset_bytes);
 
@@ -129,15 +139,16 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count) {
   auto index_type = src_index.dtype();
   if (index_type == phi::DataType::INT32) {
     GraphSendRecvOpKernelLaunchHelper<Context, T, int32_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvOpKernelLaunchHelper<Context, T, int64_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   }
 }
 
diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc
new file mode 100644
index 0000000000000..923cb8424115e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc
@@ -0,0 +1,357 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+static inline void ClipWithMask(const CPUContext& ctx,
+                                const int max_val,  // height-1 or width-1
+                                bool align_corners,
+                                std::string padding_mode,
+                                DenseTensor* grid_slice,
+                                DenseTensor* grid_scale) {
+  auto& place = *ctx.eigen_device();
+  grid_scale->Resize(grid_slice->dims());
+  ctx.Alloc<T>(grid_scale);
+
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  auto factor = static_cast<T>(max_val * 0.5);
+  if (!align_corners) {
+    factor = static_cast<T>((max_val + 1) * 0.5);
+  }
+  auto grid_scale_t = EigenTensor<T, 3>::From(*grid_scale).setConstant(factor);
+
+  if (padding_mode == "border") {
+    //    auto bounded_lo = grid_slice_t.cwiseMax(static_cast<T>(0));
+    auto res = grid_slice_t.cwiseMax(static_cast<T>(0))
+                   .cwiseMin(static_cast<T>(max_val));
+
+    auto in_bound = (res == grid_slice_t);
+    grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
+    grid_slice_t.device(place) = res;
+  } else if (padding_mode == "reflection") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto is_neg = (grid_slice_t < static_cast<T>(0));
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>());
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+      if (max_val == 0) {
+        grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
+      }
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto is_neg = ((grid_slice_t + static_cast<T>(0.5)) < static_cast<T>(0));
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      auto one_more_flip = (extra > (double_range - extra));
+      auto reflected =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      auto clipped = reflected.cwiseMax(static_cast<T>(0))
+                         .cwiseMin(static_cast<T>(max_val));
+      auto in_bound = (clipped == reflected).template cast<T>();
+      grid_scale_t.device(place) =
+          grid_scale_t * ((is_neg == one_more_flip).template cast<T>() -
+                          (is_neg != one_more_flip).template cast<T>()) *
+          in_bound;
+      grid_slice_t.device(place) = clipped;
+    }
+  }
+}
+
+template <typename T>
+static void CalcGridLocationsWithGrad(const CPUContext& ctx,
+                                      const DenseTensor& grid,
+                                      const int in_h,
+                                      const int in_w,
+                                      bool align_corners,
+                                      std::string padding_mode,
+                                      DenseTensor* grid_x,
+                                      DenseTensor* grid_y,
+                                      DenseTensor* grid_x_scale,
+                                      DenseTensor* grid_y_scale) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+
+  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
+  grid_x->Resize({n, out_h, out_w});
+  grid_y->Resize({n, out_h, out_w});
+  T* grid_x_data = ctx.Alloc<T>(grid_x);
+  T* grid_y_data = ctx.Alloc<T>(grid_y);
+
+  const T* grid_data = grid.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_x_data[i] = grid_data[2 * i];
+    grid_y_data[i] = grid_data[(2 * i) + 1];
+  }
+
+  Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  ClipWithMask<T>(
+      ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale);
+  ClipWithMask<T>(
+      ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale);
+}
+
+template <typename T>
+static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
+                                        DenseTensor* input_grad,
+                                        const DenseTensor& x,
+                                        const DenseTensor& y,
+                                        const DenseTensor& d1,
+                                        const DenseTensor& d2) {
+  const int n = output_grad.dims()[0];
+  const int c = output_grad.dims()[1];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto d1_t = EigenTensor<T, 3>::From(d1);
+  auto d2_t = EigenTensor<T, 3>::From(d2);
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (IsInBound(
+                x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
+          for (int j = 0; j < c; j++) {
+            input_grad_t(i,
+                         j,
+                         static_cast<int>(round(y_t(i, k, l))),
+                         static_cast<int>(round(x_t(i, k, l)))) +=
+                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void GatherBilinearGrad(const CPUContext& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& output_grad,
+                               DenseTensor* grid_x,
+                               DenseTensor* grid_y,
+                               DenseTensor* grid_x_scale,
+                               DenseTensor* grid_y_scale,
+                               DenseTensor* input_grad,
+                               DenseTensor* grid_grad) {
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
+  const int c = input.dims()[1];
+
+  DenseTensor x_w, x_e, y_n, y_s;
+  DenseTensor d_w, d_e, d_n, d_s;
+  DenseTensor v_wn, v_en, v_ws, v_es;
+
+  AllNeigbors<T>(ctx,
+                 input,
+                 grid_x,  // grid_x
+                 grid_y,  // grid_y
+                 &x_w,
+                 &x_e,
+                 &y_n,
+                 &y_s,
+                 &d_w,
+                 &d_e,
+                 &d_n,
+                 &d_s,
+                 &v_wn,
+                 &v_en,
+                 &v_ws,
+                 &v_es);
+
+  // gather output grad value to input grad by corner point coords and weight
+  GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_n, d_e, d_s);
+  GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_w, y_s, d_e, d_n);
+  GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_n, d_w, d_s);
+  GatherOutputGradToInputGrad<T>(output_grad, input_grad, x_e, y_s, d_w, d_n);
+
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  if (grid_grad != nullptr) {
+    DenseTensor grid_grad_x, grid_grad_y;
+    grid_grad_x.Resize({n, out_h, out_w});
+    grid_grad_y.Resize({n, out_h, out_w});
+    ctx.Alloc<T>(&grid_grad_x);
+    ctx.Alloc<T>(&grid_grad_y);
+    auto grid_grad_x_t =
+        EigenTensor<T, 3>::From(grid_grad_x).setConstant(static_cast<T>(0.0));
+    auto grid_grad_y_t =
+        EigenTensor<T, 3>::From(grid_grad_y).setConstant(static_cast<T>(0.0));
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+        for (int k = 0; k < out_h; k++) {
+          for (int l = 0; l < out_w; l++) {
+            grid_grad_x_t(i, k, l) +=
+                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
+                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
+                output_grad_t(i, j, k, l);
+            grid_grad_y_t(i, k, l) +=
+                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
+                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
+                output_grad_t(i, j, k, l);
+          }
+        }
+      }
+    }
+
+    //  const T x_max = static_cast<T>(in_w - 1);
+    //  const T y_max = static_cast<T>(in_h - 1);
+
+    auto grid_x_scale_t = EigenTensor<T, 3>::From(*grid_x_scale);
+    auto grid_y_scale_t = EigenTensor<T, 3>::From(*grid_y_scale);
+    grid_grad_x_t = grid_grad_x_t * grid_x_scale_t;
+    grid_grad_y_t = grid_grad_y_t * grid_y_scale_t;
+
+    // gather grid_grad [x, y] in 3rd Dim
+    T* grid_grad_data = grid_grad->data<T>();
+    T* grid_grad_x_data = grid_grad_x.data<T>();
+    T* grid_grad_y_data = grid_grad_y.data<T>();
+    for (int i = 0; i < n * out_h * out_w; i++) {
+      grid_grad_data[2 * i] = grid_grad_x_data[i];
+      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+    }
+  }
+}
+
+template <typename T>
+static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
+                                        DenseTensor* input_grad,
+                                        const DenseTensor& x,
+                                        const DenseTensor& y) {
+  const int n = output_grad.dims()[0];
+  const int c = output_grad.dims()[1];
+  const int out_h = output_grad.dims()[2];
+  const int out_w = output_grad.dims()[3];
+  const int in_h = input_grad->dims()[2];
+  const int in_w = input_grad->dims()[3];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (IsInBound(
+                x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
+          for (int j = 0; j < c; j++) {
+            input_grad_t(i,
+                         j,
+                         static_cast<int>(round(y_t(i, k, l))),
+                         static_cast<int>(round(x_t(i, k, l)))) +=
+                output_grad_t(i, j, k, l);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void GridSampleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& grid,
+                          const DenseTensor& out_grid,
+                          const std::string& mode,
+                          const std::string& padding_mode,
+                          bool align_corners,
+                          DenseTensor* x_grad,
+                          DenseTensor* grid_grad) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+  const int c = x.dims()[1];
+  const int in_h = x.dims()[2];
+  const int in_w = x.dims()[3];
+
+  x_grad->Resize({n, c, in_h, in_w});
+  dev_ctx.template Alloc<T>(x_grad);
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
+
+  if (grid_grad != nullptr) {
+    grid_grad->Resize({n, out_h, out_w, 2});
+    dev_ctx.template Alloc<T>(grid_grad);
+    phi::funcs::SetConstant<Context, T>()(
+        dev_ctx, grid_grad, static_cast<T>(0));
+  }
+
+  DenseTensor grid_x, grid_y;
+  DenseTensor grid_x_scale, grid_y_scale;
+  CalcGridLocationsWithGrad<T>(dev_ctx,
+                               grid,
+                               in_h,
+                               in_w,
+                               align_corners,
+                               padding_mode,
+                               &grid_x,
+                               &grid_y,
+                               &grid_x_scale,
+                               &grid_y_scale);
+  if (mode == "bilinear") {
+    GatherBilinearGrad<T>(dev_ctx,
+                          x,
+                          out_grid,
+                          &grid_x,
+                          &grid_y,
+                          &grid_x_scale,
+                          &grid_y_scale,
+                          x_grad,
+                          grid_grad);
+  } else {
+    auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+    auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+    grid_x_t = grid_x_t.round();
+    grid_y_t = grid_y_t.round();
+    GatherOutputGradToInputGrad<T>(out_grid, x_grad, grid_x, grid_y);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(grid_sample_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GridSampleGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc
new file mode 100644
index 0000000000000..92a528cdda96a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/grid_sample_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/grid_sample_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using Array4 = Eigen::DSizes<int64_t, 4>;
+
+template <typename T>
+static inline void Clip(const CPUContext& ctx,
+                        DenseTensor* grid_slice,
+                        const int max_val,  // height-1 or width-1
+                        bool align_corners,
+                        std::string padding_mode) {
+  auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+  if (padding_mode == "border") {
+    grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                     .cwiseMin(static_cast<T>(max_val));
+  } else if (padding_mode == "reflection") {
+    if (align_corners) {
+      auto double_range = static_cast<T>(max_val * 2);
+      auto grid_abs = grid_slice_t.abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) = extra.cwiseMin(double_range - extra);
+      if (max_val == 0) {
+        grid_slice_t.device(place) = grid_slice_t.constant(static_cast<T>(0));
+      }
+    } else {
+      auto double_range = static_cast<T>((max_val + 1) * 2);
+      auto grid_abs = (grid_slice_t + static_cast<T>(0.5)).abs();
+      auto extra = grid_abs - (grid_abs / double_range).floor() * double_range;
+      grid_slice_t.device(place) =
+          extra.cwiseMin(double_range - extra) - static_cast<T>(0.5);
+      grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
+                                       .cwiseMin(static_cast<T>(max_val));
+    }
+  }
+}
+
+template <typename T>
+static void CalcGridLocations(const CPUContext& ctx,
+                              const DenseTensor& grid,
+                              const int in_h,
+                              const int in_w,
+                              bool align_corners,
+                              std::string padding_mode,
+                              DenseTensor* grid_x,
+                              DenseTensor* grid_y) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+
+  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
+  grid_x->Resize({n, out_h, out_w});
+  grid_y->Resize({n, out_h, out_w});
+  T* grid_x_data = ctx.Alloc<T>(grid_x);
+  T* grid_y_data = ctx.Alloc<T>(grid_y);
+  const T* grid_data = grid.data<T>();
+  for (int i = 0; i < n * out_h * out_w; i++) {
+    grid_x_data[i] = grid_data[2 * i];
+    grid_y_data[i] = grid_data[(2 * i) + 1];
+  }
+
+  Unnormalize<T>(ctx, grid_x, in_w - 1, align_corners);
+  Unnormalize<T>(ctx, grid_y, in_h - 1, align_corners);
+
+  Clip<T>(ctx, grid_x, in_w - 1, align_corners, padding_mode);
+  Clip<T>(ctx, grid_y, in_h - 1, align_corners, padding_mode);
+}
+
+template <typename T>
+static void BilinearInter(const CPUContext& ctx,
+                          const DenseTensor& input,
+                          DenseTensor* grid_x,
+                          DenseTensor* grid_y,
+                          DenseTensor* out) {
+  auto& place = *ctx.eigen_device();
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
+  const int c = input.dims()[1];
+
+  DenseTensor x_w, x_e, y_n, y_s;
+  DenseTensor d_w, d_e, d_n, d_s;
+  DenseTensor v_wn, v_en, v_ws, v_es;
+
+  AllNeigbors<T>(ctx,
+                 input,
+                 grid_x,
+                 grid_y,
+                 &x_w,
+                 &x_e,
+                 &y_n,
+                 &y_s,
+                 &d_w,
+                 &d_e,
+                 &d_n,
+                 &d_s,
+                 &v_wn,
+                 &v_en,
+                 &v_ws,
+                 &v_es);
+
+  auto d_w_t = EigenTensor<T, 3>::From(d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+  auto d_w_scaled_t =
+      d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_e_scaled_t =
+      d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_n_scaled_t =
+      d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto d_s_scaled_t =
+      d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1));
+  auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+  auto v_en_t = EigenTensor<T, 4>::From(v_en);
+  auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+  auto v_es_t = EigenTensor<T, 4>::From(v_es);
+  auto output_t = EigenTensor<T, 4>::From(*out);
+  // bilinear interpolaetion by 4 corner points
+  output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
+                           v_en_t * d_w_scaled_t * d_s_scaled_t +
+                           v_ws_t * d_e_scaled_t * d_n_scaled_t +
+                           v_es_t * d_w_scaled_t * d_n_scaled_t;
+}
+
+template <typename T, typename Context>
+void GridSampleKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& grid,
+                      const std::string& mode,
+                      const std::string& padding_mode,
+                      bool align_corners,
+                      DenseTensor* out) {
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+  const int c = x.dims()[1];
+  const int in_h = x.dims()[2];
+  const int in_w = x.dims()[3];
+
+  out->Resize(phi::make_ddim({n, c, out_h, out_w}));
+  dev_ctx.template Alloc<T>(out);
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, out, static_cast<T>(0));
+
+  DenseTensor grid_x, grid_y;
+  CalcGridLocations<T>(
+      dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y);
+
+  if (mode == "bilinear") {
+    BilinearInter<T>(dev_ctx, x, &grid_x, &grid_y, out);
+  } else if (mode == "nearest") {
+    auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+    auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+    grid_x_t = grid_x_t.round();
+    grid_y_t = grid_y_t.round();
+    GetGridPointValue<T>(x, out, grid_x, grid_y);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h
new file mode 100644
index 0000000000000..53a16446d7e8c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/grid_sample_utils.h
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T>
+void Unnormalize(const CPUContext& ctx,
+                 DenseTensor* grid_slice,
+                 const int max_val,  // height-1 or width-1
+                 bool align_corners) {
+  auto& place = *ctx.eigen_device();
+  auto grid_slice_t = EigenTensor<T, 3>::From(*grid_slice);
+
+  if (!align_corners) {
+    auto factor = static_cast<T>((max_val + 1) * 0.5);
+    grid_slice_t.device(place) =
+        (grid_slice_t + static_cast<T>(1)) * factor - static_cast<T>(0.5);
+  } else {
+    auto factor = static_cast<T>(max_val * 0.5);
+    grid_slice_t.device(place) = (grid_slice_t + static_cast<T>(1)) * factor;
+  }
+}
+
+template <typename T>
+inline bool IsInBound(T x, T y, T x_max, T y_max) {
+  if (x < 0 || x > x_max || y < 0 || y > y_max) {
+    return false;
+  }
+  return true;
+}
+
+template <typename T>
+void GetGridPointValue(const DenseTensor& input,
+                       DenseTensor* output,
+                       const DenseTensor& x,
+                       const DenseTensor& y) {
+  const int n = input.dims()[0];
+  const int c = input.dims()[1];
+  const int in_h = input.dims()[2];
+  const int in_w = input.dims()[3];
+  const int out_h = x.dims()[1];
+  const int out_w = x.dims()[2];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
+  auto input_t = EigenTensor<T, 4>::From(input);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < out_h; k++) {
+      for (int l = 0; l < out_w; l++) {
+        if (IsInBound(
+                x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) {
+          for (int j = 0; j < c; j++) {
+            output_t(i, j, k, l) =
+                input_t(i,
+                        j,
+                        static_cast<int>(round(y_t(i, k, l))),
+                        static_cast<int>(round(x_t(i, k, l))));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void AllNeigbors(const CPUContext& ctx,
+                 const DenseTensor& input,
+                 DenseTensor* grid_x,
+                 DenseTensor* grid_y,
+                 DenseTensor* x_w,
+                 DenseTensor* x_e,
+                 DenseTensor* y_n,
+                 DenseTensor* y_s,  // positions
+                 DenseTensor* d_w,
+                 DenseTensor* d_e,
+                 DenseTensor* d_n,
+                 DenseTensor* d_s,  // distance
+                 DenseTensor* v_wn,
+                 DenseTensor* v_en,
+                 DenseTensor* v_ws,
+                 DenseTensor* v_es) {  // values
+  auto& place = *ctx.eigen_device();
+
+  const int c = input.dims()[1];
+  const int n = grid_x->dims()[0];
+  const int out_h = grid_x->dims()[1];
+  const int out_w = grid_x->dims()[2];
+  // calculate coords of 4 corner points
+  x_w->Resize({n, out_h, out_w});
+  x_e->Resize({n, out_h, out_w});
+  y_n->Resize({n, out_h, out_w});
+  y_s->Resize({n, out_h, out_w});
+  ctx.Alloc<T>(x_w);
+  ctx.Alloc<T>(x_e);
+  ctx.Alloc<T>(y_n);
+  ctx.Alloc<T>(y_s);
+  auto x_w_t = EigenTensor<T, 3>::From(*x_w);
+  auto x_e_t = EigenTensor<T, 3>::From(*x_e);
+  auto y_n_t = EigenTensor<T, 3>::From(*y_n);
+  auto y_s_t = EigenTensor<T, 3>::From(*y_s);
+
+  auto grid_x_t = EigenTensor<T, 3>::From(*grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(*grid_y);
+
+  x_w_t.device(place) = grid_x_t.floor();
+  x_e_t.device(place) = x_w_t + static_cast<T>(1);
+  y_n_t.device(place) = grid_y_t.floor();
+  y_s_t.device(place) = y_n_t + static_cast<T>(1);
+
+  // calculate distances to 4 sides
+  d_w->Resize({n, out_h, out_w});
+  d_e->Resize({n, out_h, out_w});
+  d_n->Resize({n, out_h, out_w});
+  d_s->Resize({n, out_h, out_w});
+  ctx.Alloc<T>(d_w);
+  ctx.Alloc<T>(d_e);
+  ctx.Alloc<T>(d_n);
+  ctx.Alloc<T>(d_s);
+  auto d_w_t = EigenTensor<T, 3>::From(*d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(*d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(*d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(*d_s);
+  d_w_t.device(place) = grid_x_t - x_w_t;
+  d_e_t.device(place) = x_e_t - grid_x_t;
+  d_n_t.device(place) = grid_y_t - y_n_t;
+  d_s_t.device(place) = y_s_t - grid_y_t;
+
+  // calc 4 corner points value
+  v_wn->Resize({n, c, out_h, out_w});
+  v_en->Resize({n, c, out_h, out_w});
+  v_ws->Resize({n, c, out_h, out_w});
+  v_es->Resize({n, c, out_h, out_w});
+  ctx.Alloc<T>(v_wn);
+  ctx.Alloc<T>(v_en);
+  ctx.Alloc<T>(v_ws);
+  ctx.Alloc<T>(v_es);
+  GetGridPointValue<T>(input, v_wn, *x_w, *y_n);
+  GetGridPointValue<T>(input, v_en, *x_e, *y_n);
+  GetGridPointValue<T>(input, v_ws, *x_w, *y_s);
+  GetGridPointValue<T>(input, v_es, *x_e, *y_s);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h
new file mode 100644
index 0000000000000..b79aab96c0fc2
--- /dev/null
+++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/matrix_bit_code.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+namespace math = paddle::operators::math;
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernelImpl(
+    const Context& ctx,
+    const DenseTensor& x,
+    const DenseTensor& w,
+    const DenseTensor& label,
+    const DenseTensor& pre_out,
+    const DenseTensor& out_grad,
+    paddle::optional<const DenseTensor&> path,
+    paddle::optional<const DenseTensor&> code,
+    paddle::optional<const DenseTensor&> bias,
+    int num_classes,
+    bool remote_prefetch,
+    int trainer_id,
+    const std::vector<int64_t>& height_sections,
+    const std::vector<std::string>& epmap,
+    const std::vector<std::string>& table_names,
+    bool is_sparse,
+    DenseTensor* x_grad,
+    DenseTensor* w_grad,
+    DenseTensor* bias_grad,
+    SelectedRows* w_grad_sr = nullptr) {
+  funcs::SetConstant<Context, T> zero;
+  DenseTensor pre_out_grad;
+
+  pre_out_grad.Resize(pre_out.dims());
+  ctx.template Alloc<T>(&pre_out_grad);
+  ctx.template Alloc<T>(x_grad);
+  zero(ctx, x_grad, static_cast<T>(0.0));
+
+  bool is_custom = false;
+  if (path.get_ptr()) {
+    is_custom = true;
+  }
+
+  std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
+  if (!is_custom) {
+    bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+        num_classes, label.template data<int64_t>()));
+  } else {
+    bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+        *(path.get_ptr()), *(code.get_ptr()), label.template data<int64_t>()));
+  }
+
+  // softrelu derivative
+
+  auto blas = funcs::GetBlas<Context, T>(ctx);
+
+  auto* pre_out_grad_data = pre_out_grad.data<T>();
+  auto* pre_out_data = pre_out.template data<T>();
+  auto n = pre_out.numel();
+  blas.VEXP(n, pre_out_data, pre_out_grad_data);
+  blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
+  for (int64_t i = 0; i < n; ++i) {
+    pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
+  }
+  bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
+  auto* out_grad_data = out_grad.template data<T>();
+
+  int64_t dim0 = pre_out_grad.dims()[0];
+  int64_t dim1 = pre_out_grad.dims()[1];
+  for (int64_t i = 0; i < dim0; ++i) {
+    T tmp = out_grad_data[i];
+    blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
+  }
+  // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
+  // be consistent with the clipping in forward.
+  if (bias_grad) {
+    ctx.template Alloc<T>(bias_grad);
+    zero(ctx, bias_grad, static_cast<T>(0.0));
+    bit_code->AddGrad(pre_out_grad, bias_grad);
+  }
+  ctx.template Alloc<T>(w_grad);
+  zero(ctx, w_grad, static_cast<T>(0.0));
+  if (!is_sparse) {
+    bit_code->MulGradWeight(pre_out_grad, w_grad, x);
+  } else {
+    bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x);
+  }
+  bit_code->MulGradError(pre_out_grad, w, x_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc
new file mode 100644
index 0000000000000..f64a1a8162a37
--- /dev/null
+++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& w,
+                                   const DenseTensor& label,
+                                   const DenseTensor& pre_out,
+                                   const DenseTensor& out_grad,
+                                   paddle::optional<const DenseTensor&> path,
+                                   paddle::optional<const DenseTensor&> code,
+                                   paddle::optional<const DenseTensor&> bias,
+                                   int num_classes,
+                                   bool remote_prefetch,
+                                   int trainer_id,
+                                   const std::vector<int64_t>& height_sections,
+                                   const std::vector<std::string>& epmap,
+                                   const std::vector<std::string>& table_names,
+                                   bool is_sparse,
+                                   DenseTensor* x_grad,
+                                   DenseTensor* w_grad,
+                                   DenseTensor* bias_grad) {
+  HierarchicalSigmoidGradKernelImpl<T>(ctx,
+                                       x,
+                                       w,
+                                       label,
+                                       pre_out,
+                                       out_grad,
+                                       path,
+                                       code,
+                                       bias,
+                                       num_classes,
+                                       remote_prefetch,
+                                       trainer_id,
+                                       height_sections,
+                                       epmap,
+                                       table_names,
+                                       is_sparse,
+                                       x_grad,
+                                       w_grad,
+                                       bias_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(hierarchical_sigmoid_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::HierarchicalSigmoidGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc
new file mode 100644
index 0000000000000..096a54f9fb263
--- /dev/null
+++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h"
+
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/operators/math/matrix_bit_code.h"
+#include "paddle/fluid/platform/transform.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function_impl.h"
+
+namespace phi {
+
+namespace math = paddle::operators::math;
+
+template <typename T, typename Context>
+void HierarchicalSigmoidKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& w,
+                               const DenseTensor& label,
+                               paddle::optional<const DenseTensor&> path,
+                               paddle::optional<const DenseTensor&> code,
+                               paddle::optional<const DenseTensor&> bias,
+                               int num_classes,
+                               bool remote_prefetch,
+                               int trainer_id,
+                               const std::vector<int64_t>& height_sections,
+                               const std::vector<std::string>& epmap,
+                               const std::vector<std::string>& table_names,
+                               bool is_sparse,
+                               DenseTensor* out,
+                               DenseTensor* pre_out,
+                               DenseTensor* w_out) {
+  size_t num_classes_st = static_cast<size_t>(num_classes);
+  // for remote prefetch
+
+  bool is_custom = false;
+  if (path.get_ptr()) {
+    is_custom = true;
+  }
+  int64_t code_length = path.get_ptr() ? path.get_ptr()->dims()[1]
+                                       : math::FindLastSet(num_classes_st - 1);
+  int64_t batch_size = x.dims()[0];
+  DenseTensor sum;
+  pre_out->Resize(phi::make_ddim({batch_size, code_length}));
+  ctx.template Alloc<T>(pre_out);
+  auto* pre_out_data = pre_out->data<T>();
+  auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
+  // Not all class(leaf) nodes' path lengths equal code_length, thus init as
+  // 0s can avoid out of path's loss.
+  funcs::SetConstant<Context, T> zero;
+  zero(ctx, pre_out, static_cast<T>(0.0));
+  auto& place = *ctx.eigen_device();
+  funcs::RowwiseSum<Context, T> row_sum;
+
+  std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
+  if (!is_custom) {
+    bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+        num_classes_st, label.template data<int64_t>()));
+  } else {
+    bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+        *(path.get_ptr()), *(code.get_ptr()), label.template data<int64_t>()));
+  }
+
+  std::vector<int64_t> sum_dims({batch_size, 1UL});
+  sum.Resize(phi::make_ddim(sum_dims));
+  ctx.template Alloc<T>(&sum);
+  auto sum_mat = EigenMatrix<T>::From(sum);
+  ctx.template Alloc<T>(out);
+  auto out_mat = EigenMatrix<T>::From(*out);
+  if (bias.get_ptr()) {
+    bit_code->Add(*(bias.get_ptr()), pre_out);
+  }
+  bit_code->Mul(pre_out, w, x);
+  // clip to [-40, 40]
+  paddle::platform::Transform<Context> trans;
+  trans(ctx,
+        pre_out_data,
+        pre_out_data + pre_out->numel(),
+        pre_out_data,
+        paddle::operators::ClipFunctor<T>(static_cast<T>(-40.0),
+                                          static_cast<T>(40.0)));
+  bit_code->Sum(*pre_out, out, static_cast<T>(-1));
+  // use softrelu to calculate cross entropy
+  pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
+  row_sum(ctx, *pre_out, &sum);
+  // TODO(guosheng): Subtract the out of path's loss, since not all
+  // class(leaf) nodes' path lengths equal code_length. But it won't break the
+  // gradient check since both have the out of path's loss and will cancel out
+  // each other.
+  out_mat.device(place) = sum_mat + out_mat;
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(hierarchical_sigmoid,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::HierarchicalSigmoidKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc
index 21bf9faee13cf..b895e4aa7c0e7 100644
--- a/paddle/phi/kernels/cpu/index_sample_kernel.cc
+++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc
@@ -41,7 +41,7 @@ void IndexSampleInner(const Context &context,
   std::vector<T> input_vec;
   std::vector<IndexT> index_vec;
   paddle::framework::TensorToVector(input, context, &input_vec);
-  paddle::framework::TensorToVector(index, context, &index_vec);
+  paddle::framework::TensorToVector<IndexT>(index, context, &index_vec);
 
   std::vector<T> res(index_ids_num);
   for (int i = 0; i < index_ids_num; i++) {
diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc
new file mode 100644
index 0000000000000..9dd50e7df8f06
--- /dev/null
+++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/index_select_grad_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/cpu/index_select_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IndexSelectGradKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& index,
+                           const DenseTensor& out_grad,
+                           int dim,
+                           DenseTensor* x_grad) {
+  if (dim < 0) {
+    dim += out_grad.dims().size();
+  }
+  const auto& index_type = index.dtype();
+
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Input(Index) holds the wrong type, it holds %s, but "
+                        "desires to be %s or %s",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  if (index_type == phi::DataType::INT32) {
+    IndexSelectGradInner<Context, T, int>(ctx, out_grad, index, x_grad, dim);
+  } else if (index_type == phi::DataType::INT64) {
+    IndexSelectGradInner<Context, T, int64_t>(
+        ctx, out_grad, index, x_grad, dim);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(index_select_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IndexSelectGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h
new file mode 100644
index 0000000000000..163174580ff78
--- /dev/null
+++ b/paddle/phi/kernels/cpu/index_select_impl.h
@@ -0,0 +1,178 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename Context, typename T, class Enable = void>
+struct IndexSelectAdd {
+  void operator()(const Context& ctx,
+                  int slice_size,
+                  const T* src_pointer,
+                  const T* p_pointer,
+                  T* dist_pointer) {
+    for (int i = 0; i < slice_size; i++) {
+      dist_pointer[i] = src_pointer[i] + p_pointer[i];
+    }
+  }
+};
+
+template <typename Context, typename T>
+struct IndexSelectAdd<
+    Context,
+    T,
+    typename std::enable_if<std::is_floating_point<T>::value>::type> {
+  void operator()(const Context& ctx,
+                  int slice_size,
+                  const T* src_pointer,
+                  const T* p_pointer,
+                  T* dist_pointer) {
+    auto blas = phi::funcs::GetBlas<Context, T>(ctx);
+    blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
+  }
+};
+
+template <typename Context, typename T, typename IndexT = int>
+void IndexSelectInner(const Context& ctx,
+                      DenseTensor* input,
+                      const DenseTensor& index,
+                      DenseTensor* output,
+                      int dim) {
+  auto input_dim = input->dims();
+  auto input_dim_size = input_dim.size();
+  auto output_dim = output->dims();
+  auto index_size = index.dims()[0];
+
+  DenseTensor index_cpu_copy;
+  if (!paddle::platform::is_cpu_place(index.place())) {
+    phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
+  }
+  const IndexT* index_data = paddle::platform::is_cpu_place(index.place())
+                                 ? index.data<IndexT>()
+                                 : index_cpu_copy.data<IndexT>();
+  ctx.template Alloc<T>(output);
+
+  auto slice_size = 1;
+  for (auto i = dim + 1; i < input_dim_size; i++) {
+    slice_size *= input_dim[i];
+  }
+
+  auto outer_nums = 1;
+  for (auto i = 0; i < dim; i++) {
+    outer_nums *= input_dim[i];
+  }
+
+  for (int i = 0; i < index_size; i++) {
+    PADDLE_ENFORCE_GE(
+        index_data[i],
+        0,
+        phi::errors::InvalidArgument(
+            "Variable value (index) of OP(index_select) "
+            "expected >= 0 and < %ld, but got %ld. Please check input "
+            "value.",
+            input_dim[dim],
+            index_data[i]));
+    PADDLE_ENFORCE_LT(
+        index_data[i],
+        input_dim[dim],
+        phi::errors::InvalidArgument(
+            "Variable value (index) of OP(index_select) "
+            "expected >= 0 and < %ld, but got %ld. Please check input "
+            "value.",
+            input_dim[dim],
+            index_data[i]));
+  }
+
+  VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums
+          << "; slice_size: " << slice_size << "; index_size: " << index_size;
+
+  input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size}));
+  output->Resize(phi::make_ddim({outer_nums, index_size, slice_size}));
+
+  auto input_tensor = EigenTensor<T, 3>::From(*input);
+  auto output_tensor = EigenTensor<T, 3>::From(*output);
+
+  auto& place = *ctx.eigen_device();
+
+  for (auto j = 0; j < index_size; j++) {
+    IndexT index_value = index_data[j];
+    auto output_t = output_tensor.chip(j, 1);
+    output_t.device(place) = input_tensor.chip(index_value, 1);
+  }
+  input->Resize(input_dim);
+  output->Resize(output_dim);
+}
+
+template <typename Context, typename T, typename IndexT = int>
+void IndexSelectGradInner(const Context& ctx,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& index,
+                          DenseTensor* x_grad,
+                          int dim) {
+  const T* input_data = out_grad.data<T>();
+  const IndexT* index_data = index.data<IndexT>();
+
+  const T* p_output = ctx.template Alloc<T>(x_grad);
+  T* out_data = ctx.template Alloc<T>(x_grad);
+
+  auto input_dim = out_grad.dims();
+  auto input_dim_size = input_dim.size();
+  auto output_dim = x_grad->dims();
+
+  phi::funcs::SetConstant<Context, T> set_constant;
+  set_constant(ctx, x_grad, static_cast<T>(0.0));
+
+  auto slice_size = 1;
+  for (auto i = dim + 1; i < input_dim_size; i++) {
+    slice_size *= input_dim[i];
+  }
+
+  auto input_width = slice_size * input_dim[dim];
+  auto output_width = slice_size * output_dim[dim];
+
+  auto outer_nums = 1;
+  for (auto i = 0; i < dim; i++) {
+    outer_nums *= input_dim[i];
+  }
+
+  auto index_size = index.dims()[0];
+  VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums
+          << "; slice_size: " << slice_size << "; input_width: " << input_width
+          << "; output_width: " << output_width
+          << "; index_size: " << index_size;
+
+  for (auto i = 0; i < outer_nums; i++) {
+    auto input_start_offset = i * input_width;
+    auto output_start_offset = i * output_width;
+
+    for (auto j = 0; j < index_size; j++) {
+      IndexT index_value = index_data[j];
+      auto src = input_data + input_start_offset + j * slice_size;
+      auto p_out = p_output + output_start_offset + index_value * slice_size;
+      auto dst = out_data + output_start_offset + index_value * slice_size;
+      IndexSelectAdd<Context, T> index_select_add;
+      index_select_add(ctx, slice_size, src, p_out, dst);
+    }
+  }
+  x_grad->Resize(output_dim);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc
new file mode 100644
index 0000000000000..5341ede6b2fd8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/index_select_kernel.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/index_select_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/cpu/index_select_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IndexSelectKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& index,
+                       int dim,
+                       DenseTensor* output) {
+  auto inputs = x;
+  if (dim < 0) {
+    dim += inputs.dims().size();
+  }
+  const auto& index_type = index.dtype();
+
+  bool index_type_match =
+      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Input(Index) holds the wrong type, it holds %s, but "
+                        "desires to be %s or %s",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  if (index_type == phi::DataType::INT32) {
+    IndexSelectInner<Context, T, int>(ctx, &inputs, index, output, dim);
+  } else if (index_type == phi::DataType::INT64) {
+    IndexSelectInner<Context, T, int64_t>(ctx, &inputs, index, output, dim);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(index_select,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::IndexSelectKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc
new file mode 100644
index 0000000000000..633c6ba093e42
--- /dev/null
+++ b/paddle/phi/kernels/cpu/isclose_kernel.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/isclose_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
new file mode 100644
index 0000000000000..f9399d38d711f
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    kldiv_loss_grad, CPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
new file mode 100644
index 0000000000000..c462b8ec32c89
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
+
+namespace phi {}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    kldiv_loss, CPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc
new file mode 100644
index 0000000000000..185d6cbedc85d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc
@@ -0,0 +1,168 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+template <typename T, typename Type>
+static void kthvalueAssign(const Type& input_height,
+                           const Type& input_width,
+                           const int& input_dim,
+                           const DenseTensor* input,
+                           const DenseTensor* indices,
+                           T* output_data) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      output_data[i * input_width + e_indices(0)] = e_input(0);
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void KthvalueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& d_out,
+                        const DenseTensor& x,
+                        const DenseTensor& indices,
+                        int k,
+                        int axis,
+                        bool keepdim,
+                        DenseTensor* d_x) {
+  auto in_dims = x.dims();
+  auto out_dims = indices.dims();
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+  if (!keepdim) {
+    std::vector<int> tmp_out_shape;
+    for (int i = 0; i < axis; i++) {
+      tmp_out_shape.emplace_back(out_dims[i]);
+    }
+    tmp_out_shape.emplace_back(1);
+    for (int i = axis + 1; i < in_dims.size(); i++) {
+      tmp_out_shape.emplace_back(out_dims[i - 1]);
+    }
+    out_dims = phi::make_ddim(tmp_out_shape);
+  }
+  T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
+  if (axis == in_dims.size() - 1) {
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+    memset(x_grad_data, 0, d_x->numel() * sizeof(T));
+    if (keepdim) {
+      kthvalueAssign(input_height,
+                     input_width,
+                     in_dims.size(),
+                     &d_out,
+                     &indices,
+                     x_grad_data);
+    } else {
+      DenseTensor out_grad_tmp, indices_tmp;
+      out_grad_tmp.Resize(d_out.dims());
+      indices_tmp.Resize(indices.dims());
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+      Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      kthvalueAssign(input_height,
+                     input_width,
+                     in_dims.size(),
+                     &out_grad_tmp,
+                     &indices_tmp,
+                     x_grad_data);
+    }
+  } else {
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(out_dims.size() - 1);
+    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+    DDim trans_dims(out_dims);
+    DDim trans_in_dims(in_dims);
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = out_dims[trans[i]];
+      trans_in_dims[i] = in_dims[trans[i]];
+    }
+    DenseTensor trans_dO, trans_ind;
+    trans_dO.Resize(trans_dims);
+    trans_ind.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_dO);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+    int ndims = trans.size();
+    if (keepdim) {
+      funcs::TransCompute<phi::CPUContext, T>(
+          ndims, dev_ctx, d_out, &trans_dO, trans);
+      funcs::TransCompute<phi::CPUContext, int64_t>(
+          ndims, dev_ctx, indices, &trans_ind, trans);
+    } else {
+      DenseTensor out_grad_tmp, indices_tmp;
+      out_grad_tmp.Resize(d_out.dims());
+      indices_tmp.Resize(indices.dims());
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+      Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      funcs::TransCompute<phi::CPUContext, T>(
+          ndims, dev_ctx, out_grad_tmp, &trans_dO, trans);
+      funcs::TransCompute<phi::CPUContext, int64_t>(
+          ndims, dev_ctx, indices_tmp, &trans_ind, trans);
+    }
+    const int64_t input_height = phi::product(
+        phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
+    const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_in_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    memset(t_out, 0, d_x->numel() * sizeof(T));
+    kthvalueAssign<T, int64_t>(input_height,
+                               input_width,
+                               in_dims.size(),
+                               &trans_dO,
+                               &trans_ind,
+                               t_out);
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, tmp_out, d_x, trans);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(kthvalue_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::KthvalueGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc
new file mode 100644
index 0000000000000..5e436623cae7b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kthvalue_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+template <typename T, typename Type>
+static void getKthvalue(Type input_height,
+                        Type input_width,
+                        int input_dim,
+                        const DenseTensor* input,
+                        T* t_out,
+                        Type* t_indices,
+                        const int& k) {
+  bool partial_sort_flag = (k * 64) < input_width;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    if (partial_sort_flag) {
+      std::partial_sort(
+          col_vec.begin(),
+          col_vec.begin() + k,
+          col_vec.end(),
+          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+            return (!std::isnan(static_cast<double>(l.first)) &&
+                    std::isnan(static_cast<double>(r.first))) ||
+                   (l.first < r.first);
+          });
+    } else {
+      std::nth_element(
+          col_vec.begin(),
+          col_vec.begin() + k - 1,
+          col_vec.end(),
+          [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+            return (!std::isnan(static_cast<double>(l.first)) &&
+                    std::isnan(static_cast<double>(r.first))) ||
+                   (l.first < r.first);
+          });
+    }
+    t_out[i] = col_vec[k - 1].first;
+    t_indices[i] = col_vec[k - 1].second;
+  }
+}
+
+template <typename T, typename Context>
+void KthvalueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    int k,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* output,
+                    DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  if (axis < 0) axis += in_dims.size();
+  T* output_data = dev_ctx.template Alloc<T>(output);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  auto out_dims = output->dims();
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    getKthvalue<T, int64_t>(input_height,
+                            input_width,
+                            in_dims.size(),
+                            &x,
+                            output_data,
+                            indices_data,
+                            k);
+  } else {
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
+      output->Resize(tmp_out_dims);
+      indices->Resize(tmp_out_dims);
+    }
+    DDim trans_dims(in_dims);
+    DDim trans_out_dims(in_dims);
+
+    for (size_t i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+      trans_out_dims[i] = in_dims[trans[i]];
+    }
+    trans_out_dims[in_dims.size() - 1] = 1;
+    DenseTensor trans_inp;
+    trans_inp.Resize(trans_dims);
+    dev_ctx.template Alloc<T>(&trans_inp);
+    int ndims = trans.size();
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, x, &trans_inp, trans);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+    DenseTensor tmp_out, tmp_indices;
+    tmp_out.Resize(trans_out_dims);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    tmp_indices.Resize(trans_out_dims);
+    int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
+    getKthvalue<T, int64_t>(
+        input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k);
+    funcs::TransCompute<phi::CPUContext, int64_t>(
+        ndims, dev_ctx, tmp_indices, indices, trans);
+    funcs::TransCompute<phi::CPUContext, T>(
+        ndims, dev_ctx, tmp_out, output, trans);
+    if (!keepdim) {
+      output->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(kthvalue,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::KthvalueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc
new file mode 100644
index 0000000000000..cee48ed96db1c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc
@@ -0,0 +1,186 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
+#include "paddle/phi/kernels/funcs/layer_norm_util.h"
+#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__)
+#include "paddle/fluid/operators/jit/kernels.h"
+#endif
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LayerNormGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& mean,
+                         const DenseTensor& variance,
+                         paddle::optional<const DenseTensor&> scale_opt,
+                         paddle::optional<const DenseTensor&> bias_opt,
+                         const DenseTensor& out_grad,
+                         float epsilon,
+                         int begin_norm_axis,
+                         bool is_test,
+                         DenseTensor* x_grad,
+                         DenseTensor* scale_grad,
+                         DenseTensor* bias_grad) {
+  auto* scale = scale_opt.get_ptr();
+  auto d_y = out_grad;
+
+  // init output
+  auto* d_x = x_grad;
+  auto* d_scale = scale_grad;
+  auto* d_bias = bias_grad;
+
+  const auto& x_dims = x.dims();
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int left = static_cast<int>(matrix_dim[0]);
+  int right = static_cast<int>(matrix_dim[1]);
+  DDim matrix_shape({left, right});
+
+  d_y.Resize(matrix_shape);
+
+  funcs::ColwiseSum2D<phi::CPUContext, T> colwise_sum(left, right, dev_ctx);
+  DenseTensor x_tmp = x;
+
+  DenseTensor temp;
+  DenseTensor temp_norm;
+  if (d_scale || d_x) {
+    x_tmp.Resize(matrix_shape);
+    temp.Resize(matrix_shape);
+    dev_ctx.template Alloc<T>(&temp);
+
+    temp_norm.Resize(matrix_shape);
+    dev_ctx.template Alloc<T>(&temp_norm);
+    // get x_norm
+    phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
+        dev_ctx,
+        x_tmp,
+        mean,
+        /*axis*/ 0,
+        funcs::SubtractFunctor<T>(),
+        &temp_norm);
+    phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
+        dev_ctx,
+        temp_norm,
+        variance,
+        /*axis*/ 0,
+        funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
+        &temp_norm);
+  }
+
+  if (d_bias) {
+    dev_ctx.template Alloc<T>(d_bias);
+    colwise_sum(dev_ctx, d_y, d_bias);
+  }
+  if (d_scale) {
+    dev_ctx.template Alloc<T>(d_scale);
+    phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+        dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor<T>(), &temp);
+    colwise_sum(dev_ctx, temp, d_scale);
+  }
+
+  if (d_x) {
+    DDim vec_shape({left});
+    dev_ctx.template Alloc<T>(d_x);
+    auto dx_dim = d_x->dims();
+    DenseTensor temp_vec;
+    temp_vec.Resize(vec_shape);
+    dev_ctx.template Alloc<T>(&temp_vec);
+
+    funcs::RowwiseMean2D<phi::CPUContext, T> row_mean(left, right, dev_ctx);
+
+    if (d_scale) {
+      // dy_dx
+      phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+          dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor<T>(), &temp);
+      phi::Copy<Context>(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x);
+
+      // dy_dmean_dx
+      row_mean(dev_ctx, temp, &temp_vec);
+      phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
+          dev_ctx,
+          *d_x,
+          temp_vec,
+          /*axis*/ 0,
+          funcs::SubtractFunctor<T>(),
+          d_x);
+
+      // dy_var_dx
+      phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+          dev_ctx,
+          temp,
+          temp_norm,
+          /*axis*/ 0,
+          funcs::MultiplyFunctor<T>(),
+          &temp);
+    } else {
+      // dy_dx
+      phi::Copy<Context>(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x);
+
+      // dy_dmean_dx
+      row_mean(dev_ctx, d_y, &temp_vec);
+      phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
+          dev_ctx,
+          *d_x,
+          temp_vec,
+          /*axis*/ 0,
+          funcs::SubtractFunctor<T>(),
+          d_x);
+
+      // dy_var_dx
+      phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+          dev_ctx,
+          d_y,
+          temp_norm,
+          /*axis*/ 0,
+          funcs::MultiplyFunctor<T>(),
+          &temp);
+    }
+    // dy_var_dx
+    row_mean(dev_ctx, temp, &temp_vec);
+    phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+        dev_ctx,
+        temp_norm,
+        temp_vec,
+        /*axis*/ 0,
+        funcs::MultiplyFunctor<T>(),
+        &temp);
+    phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
+        dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor<T>(), d_x);
+
+    phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
+        dev_ctx,
+        *d_x,
+        variance,
+        /*axis*/ 0,
+        funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
+        d_x);
+    d_x->Resize(dx_dim);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc
new file mode 100644
index 0000000000000..5b09d68c7ca08
--- /dev/null
+++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/layer_norm_kernel.h"
+#include "paddle/phi/kernels/cpu/elementwise.h"
+#include "paddle/phi/kernels/funcs/layer_norm_util.h"
+#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
+    !defined(__OSX__)
+#include "paddle/fluid/operators/jit/kernels.h"
+#endif
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LayerNormKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     paddle::optional<const DenseTensor&> scale_opt,
+                     paddle::optional<const DenseTensor&> bias_opt,
+                     float epsilon,
+                     int begin_norm_axis,
+                     bool is_test,
+                     DenseTensor* y,
+                     DenseTensor* mean,
+                     DenseTensor* var) {
+  const auto x_dims = x.dims();
+  auto* scale = scale_opt.get_ptr();
+  auto* bias = bias_opt.get_ptr();
+
+  dev_ctx.template Alloc<T>(y);
+  dev_ctx.template Alloc<T>(mean);
+  dev_ctx.template Alloc<T>(var);
+
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int left = static_cast<int>(matrix_dim[0]);
+  int right = static_cast<int>(matrix_dim[1]);
+  DDim matrix_shape({left, right});
+
+  auto x_tmp = x;
+  x_tmp.Resize(matrix_shape);
+  DenseTensor out;
+  out.ShareDataWith(*y);
+  out.Resize(matrix_shape);
+
+#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \
+    defined(__OSX__)
+
+  funcs::RowwiseMean2D<phi::CPUContext, T> row_mean(left, right, dev_ctx);
+
+  // get mean
+  row_mean(dev_ctx, x_tmp, mean);
+
+  // get variance
+
+  phi::funcs::ElementwiseCompute<funcs::SubAndSquareFunctor<T>, T, T>(
+      dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor<T>(), &out);
+
+  row_mean(dev_ctx, out, var);
+
+  // get x_norm
+  phi::funcs::ElementwiseCompute<funcs::SubtractFunctor<T>, T, T>(
+      dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor<T>(), &out);
+
+  phi::funcs::ElementwiseCompute<funcs::DivAndSqrtFunctor<T>, T, T>(
+      dev_ctx,
+      out,
+      *var,
+      0,
+      funcs::DivAndSqrtFunctor<T>(static_cast<T>(epsilon)),
+      &out);
+
+  if (scale) {
+    phi::funcs::ElementwiseCompute<funcs::MultiplyFunctor<T>, T, T>(
+        dev_ctx, out, *scale, 1, funcs::MultiplyFunctor<T>(), &out);
+  }
+  if (bias) {
+    phi::funcs::ElementwiseCompute<funcs::AddFunctor<T>, T, T>(
+        dev_ctx, out, *bias, 1, funcs::AddFunctor<T>(), &out);
+  }
+#else
+  PADDLE_ENFORCE_EQ(mean->numel(),
+                    left,
+                    phi::errors::InvalidArgument(
+                        "mean's length (%d) is not equal with expected (%d).",
+                        mean->numel(),
+                        left));
+  PADDLE_ENFORCE_EQ(var->numel(),
+                    left,
+                    phi::errors::InvalidArgument(
+                        "var's length (%d) is not equal with expected (%d).",
+                        var->numel(),
+                        left));
+  if (scale) {
+    PADDLE_ENFORCE_EQ(
+        scale->numel(),
+        right,
+        phi::errors::InvalidArgument(
+            "scale's length (%d) is not equal with expected (%d).",
+            scale->numel(),
+            right));
+  }
+  if (bias) {
+    PADDLE_ENFORCE_EQ(bias->numel(),
+                      right,
+                      phi::errors::InvalidArgument(
+                          "bias's length (%d) is not equal with expected (%d).",
+                          bias->numel(),
+                          right));
+  }
+
+  auto ker = paddle::operators::jit::KernelFuncs<
+                 paddle::operators::jit::LayerNormTuple<T>,
+                 phi::CPUPlace>::Cache()
+                 .At(right);
+  ker(x_tmp.data<T>(),
+      out.data<T>(),
+      mean->data<T>(),
+      var->data<T>(),
+      scale ? scale->data<T>() : nullptr,
+      bias ? bias->data<T>() : nullptr,
+      static_cast<int>(left),
+      static_cast<const float>(epsilon),
+      right);
+#endif
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
new file mode 100644
index 0000000000000..116fa3f8d3f6a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    lgamma_grad, CPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc
new file mode 100644
index 0000000000000..f849322174d29
--- /dev/null
+++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_kernel.h"
+
+#include <unsupported/Eigen/SpecialFunctions>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+namespace phi {
+template <typename T>
+struct LgammaFunctor {
+  LgammaFunctor(const T* input, T* output, int64_t numel)
+      : input_(input), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    output_[idx] = Eigen::numext::lgamma(input_[idx]);
+  }
+
+ private:
+  const T* input_;
+  T* output_;
+  int64_t numel_;
+};
+
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out) {
+  auto numel = x.numel();
+  auto* x_data = x.data<T>();
+  auto* out_data = dev_ctx.template Alloc<T>(out);
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  LgammaFunctor<T> functor(x_data, out_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(lgamma, CPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc
new file mode 100644
index 0000000000000..5f344b9cc3fe0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_softmax_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrixTemplate = EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Context, typename T>
+struct LogSoftmaxGradFunctor {
+  void operator()(const Context& context,
+                  const DenseTensor* Y,
+                  const DenseTensor* dY,
+                  DenseTensor* dX,
+                  const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+
+    const int n = funcs::SizeToAxis(axis, Y->dims());
+    const int d = funcs::SizeFromAxis(axis, Y->dims());
+    phi::DDim dim_2d{n, d};
+
+    auto y = EigenMatrixTemplate<T>::From(*Y, dim_2d);
+    auto dy = EigenMatrixTemplate<T>::From(*dY, dim_2d);
+    auto dx = EigenMatrixTemplate<T>::From(*dX, dim_2d);
+
+    const int axis_dim = Y->dims()[axis];
+    const int batch_size = y.dimension(kBatchDim);
+    const int num_classes = y.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+
+    dx.device(*context.eigen_device()) =
+        dy -
+        (y.exp()) * (dy.reshape(batch_axis_remain)
+                         .sum(along_class)
+                         .broadcast(one_axis));
+  }
+};
+
+template <typename T, typename Context>
+void LogSoftmaxGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& out_grad,
+                          int axis,
+                          DenseTensor* x_grad) {
+  const int rank = out.dims().size();
+  const int canonical_axis = funcs::CanonicalAxis(axis, rank);
+
+  dev_ctx.template Alloc<T>(x_grad);
+  if (out.numel() != 0) {
+    LogSoftmaxGradFunctor<Context, T>()(
+        dev_ctx, &out, &out_grad, x_grad, canonical_axis);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(log_softmax_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::LogSoftmaxGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc
new file mode 100644
index 0000000000000..241742378cc5d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_softmax_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrixTemplate = EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = static_cast<T>(-64.);
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename Context, typename T>
+struct LogSoftmaxFunctor {
+  void operator()(const Context& context,
+                  const DenseTensor* X,
+                  DenseTensor* Y,
+                  const int axis) {
+    constexpr int kBatchDim = 0;
+    constexpr int kClassDim = 1;
+    constexpr int kAxisDim = 1;
+
+    int axis_dim = X->dims()[axis];
+    const int n = funcs::SizeToAxis(axis, X->dims());
+    const int d = funcs::SizeFromAxis(axis, X->dims());
+    phi::DDim dim_2d{n, d};
+
+    auto logits = EigenMatrixTemplate<T>::From(*X, dim_2d);
+    auto log_softmax = EigenMatrixTemplate<T>::From(*Y, dim_2d);
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+    const int num_remain = num_classes / axis_dim;
+
+    Eigen::DSizes<int, 1> along_axis(kAxisDim);
+    Eigen::DSizes<int, 2> batch_classes(batch_size, num_classes);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    Eigen::DSizes<int, 3> batch_one_remain(batch_size, 1, num_remain);
+    Eigen::DSizes<int, 3> one_axis_one(1, axis_dim, 1);
+    Eigen::DSizes<int, 2> one_axis(1, axis_dim);
+    Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
+
+    // For numerical stability, logits should be shifted by maximum number along
+    // axis, calculate shifted_logits into log_softmax tensor for memory reuse.
+    if (num_remain == 1) {
+      // axis == -1, axis and class in same dimension, calculate along
+      // class dimension directly for higher performance
+      log_softmax.device(*context.eigen_device()) =
+          (logits -
+           logits.maximum(along_axis)
+               .eval()
+               .reshape(batch_by_one)
+               .broadcast(one_by_class))
+              .unaryExpr(ValueClip<T>());
+    } else {
+      // axis != -1, class dimension split into (axis, remain), max and sum
+      // should be calculated along axis dimension
+      log_softmax.device(*context.eigen_device()) =
+          (logits.reshape(batch_axis_remain) -
+           logits.reshape(batch_axis_remain)
+               .maximum(along_axis)
+               .eval()
+               .reshape(batch_one_remain)
+               .broadcast(one_axis_one)
+               .reshape(batch_classes))
+              .unaryExpr(ValueClip<T>());
+    }
+
+    log_softmax.device(*context.eigen_device()) =
+        log_softmax -
+        log_softmax.exp()
+            .eval()
+            .reshape(batch_axis_remain)
+            .sum(along_axis)
+            .log()
+            .broadcast(one_axis);
+  }
+};
+
+template <typename T, typename Context>
+void LogSoftmaxKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      int axis,
+                      DenseTensor* out) {
+  const int rank = x.dims().size();
+  const int canonical_axis = funcs::CanonicalAxis(axis, rank);
+
+  dev_ctx.template Alloc<T>(out);
+  if (x.numel() != 0) {
+    LogSoftmaxFunctor<Context, T>()(dev_ctx, &x, out, canonical_axis);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc
deleted file mode 100644
index 250f656926c05..0000000000000
--- a/paddle/phi/kernels/cpu/math_kernel.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/math_kernel.h"
-
-#include "paddle/phi/api/ext/dispatch.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/scalar.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/cpu/elementwise.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/funcs/reduce_functor.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/phi/common/bfloat16.h"
-#include "paddle/phi/common/complex.h"
-
-namespace phi {
-
-#define DEFINE_CPU_ELEMENTWISE_OP(name)                                     \
-  template <typename T, typename Context>                                   \
-  void name##RawKernel(const Context& dev_ctx,                              \
-                       const DenseTensor& x,                                \
-                       const DenseTensor& y,                                \
-                       int axis,                                            \
-                       DenseTensor* out) {                                  \
-    dev_ctx.template Alloc<T>(out);                                         \
-    if (x.dims() == y.dims()) {                                             \
-      SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
-          dev_ctx, x, y, out);                                              \
-    } else {                                                                \
-      auto x_dims = x.dims();                                               \
-      auto y_dims = y.dims();                                               \
-      if (x_dims.size() >= y_dims.size()) {                                 \
-        funcs::ElementwiseCompute<funcs::name##Functor<T>, T>(              \
-            dev_ctx, x, y, axis, funcs::name##Functor<T>(), out);           \
-      } else {                                                              \
-        funcs::ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>(     \
-            dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out);  \
-      }                                                                     \
-    }                                                                       \
-  }
-
-template <typename T, typename Context>
-void MeanRawKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const std::vector<int64_t>& dims,
-                   bool keep_dim,
-                   bool reduce_all,
-                   DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-template <typename T, typename Context>
-void SumRawKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const std::vector<int64_t>& dims,
-                  bool keep_dim,
-                  bool reduce_all,
-                  DataType out_dtype,
-                  DenseTensor* out) {
-  phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-template <typename T, typename Context>
-void DivideRawKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis,
-                     DenseTensor* out) {
-  // allocate memory for out
-  dev_ctx.template Alloc<T>(out);
-  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
-    SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
-        dev_ctx, x, y, out);
-  } else {
-    auto x_dims = x.dims();
-    auto y_dims = y.dims();
-    if (x_dims.size() >= y_dims.size()) {
-      funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
-    } else {
-      funcs::ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
-          dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
-    }
-  }
-}
-
-// Create the definition of Add
-DEFINE_CPU_ELEMENTWISE_OP(Add)
-
-// Create the definition of Subtract
-DEFINE_CPU_ELEMENTWISE_OP(Subtract)
-
-// Create the definition of Multiply
-DEFINE_CPU_ELEMENTWISE_OP(Multiply)
-
-}  // namespace phi
-
-using complex64 = ::phi::dtype::complex<float>;
-using complex128 = ::phi::dtype::complex<double>;
-
-// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
-// using bfloat16 = ::phi::dtype::bfloat16;
-PD_REGISTER_KERNEL(add_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::AddRawKernel,
-                   float,
-                   double,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PD_REGISTER_KERNEL(subtract_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::SubtractRawKernel,
-                   float,
-                   double,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(divide_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::DivideRawKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {}
-PD_REGISTER_KERNEL(multiply_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::MultiplyRawKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   complex64,
-                   complex128,
-                   phi::dtype::bfloat16) {}
-PD_REGISTER_KERNEL(sum_raw,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::SumRawKernel,
-                   bool,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-PD_REGISTER_KERNEL(
-    mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
index 70b6316e10444..ae1e406d16eec 100644
--- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
+++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc
@@ -17,13 +17,13 @@
 #include <Eigen/Dense>
 #include <Eigen/SVD>
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
-#include "paddle/phi/kernels/math_kernel.h"
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc
new file mode 100644
index 0000000000000..ca813c1757eac
--- /dev/null
+++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc
@@ -0,0 +1,170 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  auto in_dims = x.dims();
+  auto out_dims = indices.dims();
+
+  // axis < 0, get the real axis
+  axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+
+  if (!keepdim) {
+    std::vector<int> tmp_out_shape;
+    for (int i = 0; i < axis; i++) {
+      tmp_out_shape.emplace_back(out_dims[i]);
+    }
+    tmp_out_shape.emplace_back(1);
+    for (int i = axis + 1; i < in_dims.size(); i++) {
+      tmp_out_shape.emplace_back(out_dims[i - 1]);
+    }
+    out_dims = phi::make_ddim(tmp_out_shape);
+  }
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+
+  if (axis == in_dims.size() - 1) {
+    // allocate the memory for the input_grad
+    // assign the out_grad to input_grad directly
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t input_width = in_dims[in_dims.size() - 1];
+
+    // init the output grad with 0, because some input elements has no grad
+    memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
+    // Assign the output_grad to input_grad
+    if (keepdim) {
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad,
+                        &indices,
+                        x_grad_data);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+
+      funcs::ModeAssign(input_height,
+                        input_width,
+                        in_dims.size(),
+                        &out_grad_tmp,
+                        &indices_tmp,
+                        x_grad_data);
+    }
+  } else {
+    // can not assign grad to input_grad, must do the transpose
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(out_dims.size() - 1);
+    for (int i = axis + 1; i < out_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+    DDim trans_shape(out_dims);
+    DDim trans_in_shape(in_dims);
+    for (size_t i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = out_dims[trans_axis[i]];
+      trans_in_shape[i] = in_dims[trans_axis[i]];
+    }
+    // transpose the out_grad, indices
+    DenseTensor trans_dO;
+    trans_dO.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_dO);
+
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_shape);
+    dev_ctx.template Alloc<int64_t>(&trans_ind);
+
+    int ndims = trans_axis.size();
+
+    if (keepdim) {
+      // Do transpose
+      funcs::TransCompute<CPUContext, T>(
+          ndims, dev_ctx, out_grad, &trans_dO, trans_axis);
+      funcs::TransCompute<CPUContext, int64_t>(
+          ndims, dev_ctx, indices, &trans_ind, trans_axis);
+    } else {
+      DenseTensor out_grad_tmp;
+      dev_ctx.template Alloc<T>(&out_grad_tmp);
+
+      DenseTensor indices_tmp;
+      dev_ctx.template Alloc<int64_t>(&indices_tmp);
+
+      phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp);
+      phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp);
+      out_grad_tmp.Resize(out_dims);
+      indices_tmp.Resize(out_dims);
+      // Do transpose
+      funcs::TransCompute<CPUContext, T>(
+          ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis);
+      funcs::TransCompute<CPUContext, int64_t>(
+          ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis);
+    }
+    const int64_t input_height = phi::product(
+        phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1));
+    const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1];
+
+    // Assign the out_grad to tranpose input_grad
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_in_shape);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+    memset(t_out, 0, x_grad->numel() * sizeof(T));
+
+    funcs::ModeAssign<T, int64_t>(input_height,
+                                  input_width,
+                                  in_dims.size(),
+                                  &trans_dO,
+                                  &trans_ind,
+                                  t_out);
+
+    // Transpose back
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, tmp_out, x_grad, trans_axis);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(mode_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc
new file mode 100644
index 0000000000000..6535d1b89af42
--- /dev/null
+++ b/paddle/phi/kernels/cpu/mode_kernel.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  auto out_dims = out->dims();
+  // axis < 0, cacluate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+  // if axis is not the last dim, transpose it to the last dim, do the
+  // calculation, then tranpose it back to original axis.
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetMode<T, int64_t>(input_height,
+                               input_width,
+                               in_dims.size(),
+                               &x,
+                               output_data,
+                               indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.push_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+
+    // get the trans input_dims, out_dims
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+
+    for (size_t i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+    int ndims = trans_axis.size();
+
+    // transpose the input value
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    DenseTensor tmp_out;
+    tmp_out.Resize(trans_out_shape);
+    T* t_out = dev_ctx.template Alloc<T>(&tmp_out);
+
+    DenseTensor tmp_indices;
+    tmp_indices.Resize(trans_out_shape);
+    int64_t* t_ind = dev_ctx.template Alloc<int64_t>(&tmp_indices);
+
+    funcs::GetMode<T, int64_t>(
+        input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind);
+    // transpose back
+    funcs::TransCompute<CPUContext, int64_t>(
+        ndims, dev_ctx, tmp_indices, indices, trans_axis);
+    funcs::TransCompute<CPUContext, T>(
+        ndims, dev_ctx, tmp_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc
new file mode 100644
index 0000000000000..f5a426e93db2c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/multiplex_grad_kernel.h"
+
+#include "paddle/fluid/memory/memcpy.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexGradKernel(const Context& ctx,
+                         const DenseTensor& ids,
+                         const DenseTensor& out_grad,
+                         std::vector<DenseTensor*> ins_grad) {
+  size_t idx = -1UL;
+  for (size_t i = 0; i < ins_grad.size(); i++) {
+    if (ins_grad[i]) {
+      ctx.template Alloc<T>(ins_grad[i]);
+      auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
+      t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
+      idx = i;
+    }
+  }
+  if (idx == -1UL) return;
+
+  auto rows = ins_grad[idx]->dims()[0];
+  auto cols = ins_grad[idx]->numel() / rows;
+  auto* index = ids.data<int32_t>();
+  for (auto i = 0; i < rows; i++) {
+    size_t k = static_cast<size_t>(index[i]);
+    if (ins_grad[k]) {
+      paddle::memory::Copy(ctx.GetPlace(),
+                           ins_grad[k]->data<T>() + i * cols,
+                           ctx.GetPlace(),
+                           out_grad.data<T>() + i * cols,
+                           cols * sizeof(T));
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(multiplex_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplexGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc
new file mode 100644
index 0000000000000..2d9f4c51a981e
--- /dev/null
+++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/multiplex_kernel.h"
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexKernel(const Context& ctx,
+                     const std::vector<const DenseTensor*>& ins,
+                     const DenseTensor& ids,
+                     DenseTensor* out) {
+  ctx.template Alloc<T>(out);
+  for (size_t i = 0; i < ins.size(); ++i) {
+    PADDLE_ENFORCE_GT(
+        ins[i]->numel(),
+        0,
+        errors::OutOfRange(
+            "indexing will be out of bounds with size 0 for the %d-th input.",
+            i));
+  }
+  auto rows = ins[0]->dims()[0];
+  auto cols = ins[0]->numel() / rows;
+  auto index = ids.data<int32_t>();
+  for (auto i = 0; i < rows; i++) {
+    int32_t k = index[i];
+    PADDLE_ENFORCE_GE(
+        k, 0, errors::PreconditionNotMet("index must be nonnegative."));
+    PADDLE_ENFORCE_LT(static_cast<size_t>(k),
+                      ins.size(),
+                      errors::PreconditionNotMet(
+                          "index exceeds the number of candidate tensors."));
+    paddle::memory::Copy(ctx.GetPlace(),
+                         out->data<T>() + i * cols,
+                         ctx.GetPlace(),
+                         ins[k]->data<T>() + i * cols,
+                         cols * sizeof(T));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(multiplex,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MultiplexKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/phi/kernels/cpu/one_hot_kernel.cc
similarity index 50%
rename from paddle/fluid/operators/one_hot_v2_op.h
rename to paddle/phi/kernels/cpu/one_hot_kernel.cc
index 9d42c5875bb6e..dc58489ebf70e 100644
--- a/paddle/fluid/operators/one_hot_v2_op.h
+++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,23 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/kernels/one_hot_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
+namespace phi {
 
 template <typename DeviceContext, typename InT>
 struct OneHotV2OpFunctor {
-  const framework::LoDTensor* in_;
-  framework::LoDTensor* out_;
+  const DenseTensor* in_;
+  DenseTensor* out_;
   int depth_;
   const DeviceContext& ctx_;
   bool allow_out_of_range_;
 
-  OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
-                    int depth, const DeviceContext& ctx,
+  OneHotV2OpFunctor(const DenseTensor* in,
+                    DenseTensor* out,
+                    int depth,
+                    const DeviceContext& ctx,
                     bool allow_out_of_range = false)
       : in_(in),
         out_(out),
@@ -40,8 +42,8 @@ struct OneHotV2OpFunctor {
   void apply() const {
     auto* p_in_data = in_->data<InT>();
     auto numel = in_->numel();
-    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
-    phi::funcs::set_constant(ctx_, out_, 0.0);
+    auto* p_out_data = ctx_.template Alloc<OutT>(out_);
+    funcs::set_constant(ctx_, out_, 0.0);
 
     if (allow_out_of_range_) {
       for (int i = 0; i < numel; ++i) {
@@ -52,51 +54,46 @@ struct OneHotV2OpFunctor {
     } else {
       for (int i = 0; i < numel; ++i) {
         PADDLE_ENFORCE_GE(
-            p_in_data[i], 0,
-            platform::errors::InvalidArgument(
+            p_in_data[i],
+            0,
+            phi::errors::InvalidArgument(
                 "Illegal index value, Input(input) value should be at least 0, "
                 "but received input (%d) less than 0",
                 p_in_data[i]));
         PADDLE_ENFORCE_LT(
-            p_in_data[i], depth_,
-            platform::errors::InvalidArgument(
+            p_in_data[i],
+            depth_,
+            phi::errors::InvalidArgument(
                 "Illegal index value, Input(input) value should be less than "
                 "Input(depth), "
                 "but received input (%d) not less than depth (%d)",
-                p_in_data[i], depth_));
+                p_in_data[i],
+                depth_));
         *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
       }
     }
   }
 };
 
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-template <typename DeviceContext, typename T>
-class OneHotV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    int depth = context.Attr<int>("depth");
-    bool allow_out_of_range = context.Attr<bool>("allow_out_of_range");
-    if (context.HasInput("depth_tensor")) {
-      auto* depth_tensor = context.Input<Tensor>("depth_tensor");
-      auto* depth_data = depth_tensor->data<int32_t>();
-      depth = depth_data[0];
-      auto out_dims = out->dims();
-      out_dims[out_dims.size() - 1] = depth;
-      out->Resize(out_dims);
-    }
-
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("dtype")),
-        OneHotV2OpFunctor<DeviceContext, T>(
-            in, out, depth, context.template device_context<DeviceContext>(),
-            allow_out_of_range));
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out) {
+  auto out_dims = out->dims();
+  if (out_dims[out_dims.size() - 1] == -1) {
+    out_dims[out_dims.size() - 1] = depth;
+    out->Resize(out_dims);
   }
-};
 
-}  // namespace operators
-}  // namespace paddle
+  phi::VisitDataType(dtype,
+                     OneHotV2OpFunctor<Context, T>(
+                         &x, out, depth, dev_ctx, allow_out_of_range));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    one_hot_raw, CPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc
new file mode 100644
index 0000000000000..b1adb3e206da9
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pad3d_grad_kernel.cc
@@ -0,0 +1,480 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad3d_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+void ConstPad3DGradNCDHW(T* d_in_data,
+                         const T* d_out_data,
+                         const int in_depth,
+                         const int in_height,
+                         const int in_width,
+                         const int out_depth,
+                         const int out_height,
+                         const int out_width,
+                         const int pad_front,
+                         const int pad_top,
+                         const int pad_left,
+                         const int out_d,
+                         const int out_h,
+                         const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] =
+        d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+void ConstPad3DGradNDHWC(T* d_in_data,
+                         const T* d_out_data,
+                         const int channels,
+                         const int in_depth,
+                         const int in_height,
+                         const int in_width,
+                         const int out_depth,
+                         const int out_height,
+                         const int out_width,
+                         const int pad_front,
+                         const int pad_top,
+                         const int pad_left,
+                         const int out_d,
+                         const int out_h,
+                         const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (!(in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+        in_h >= in_height || in_w >= in_width)) {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      d_in_data[in_index + c] = d_out_data[out_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DGradNCDHW(T* d_in_data,
+                           const T* d_out_data,
+                           const int in_depth,
+                           const int in_height,
+                           const int in_width,
+                           const int out_depth,
+                           const int out_height,
+                           const int out_width,
+                           const int pad_front,
+                           const int pad_top,
+                           const int pad_left,
+                           const int out_d,
+                           const int out_h,
+                           const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReflectPad3DGradNDHWC(T* d_in_data,
+                           const T* d_out_data,
+                           const int channels,
+                           const int in_depth,
+                           const int in_height,
+                           const int in_width,
+                           const int out_depth,
+                           const int out_height,
+                           const int out_width,
+                           const int pad_front,
+                           const int pad_top,
+                           const int pad_left,
+                           const int out_d,
+                           const int out_h,
+                           const int out_w) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DGradNCDHW(T* d_in_data,
+                             const T* d_out_data,
+                             const int in_depth,
+                             const int in_height,
+                             const int in_width,
+                             const int out_depth,
+                             const int out_height,
+                             const int out_width,
+                             const int pad_front,
+                             const int pad_top,
+                             const int pad_left,
+                             const int out_d,
+                             const int out_h,
+                             const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void ReplicatePad3DGradNDHWC(T* d_in_data,
+                             const T* d_out_data,
+                             const int channels,
+                             const int in_depth,
+                             const int in_height,
+                             const int in_width,
+                             const int out_depth,
+                             const int out_height,
+                             const int out_width,
+                             const int pad_front,
+                             const int pad_top,
+                             const int pad_left,
+                             const int out_d,
+                             const int out_h,
+                             const int out_w) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DGradNCDHW(T* d_in_data,
+                            const T* d_out_data,
+                            const int in_depth,
+                            const int in_height,
+                            const int in_width,
+                            const int out_depth,
+                            const int out_height,
+                            const int out_width,
+                            const int pad_front,
+                            const int pad_top,
+                            const int pad_left,
+                            const int out_d,
+                            const int out_h,
+                            const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+  d_in_data[in_d * in_height * in_width + in_h * in_width + in_w] +=
+      d_out_data[out_d * out_height * out_width + out_h * out_width + out_w];
+}
+
+template <typename T>
+void CircularPad3DGradNDHWC(T* d_in_data,
+                            const T* d_out_data,
+                            const int channels,
+                            const int in_depth,
+                            const int in_height,
+                            const int in_width,
+                            const int out_depth,
+                            const int out_height,
+                            const int out_width,
+                            const int pad_front,
+                            const int pad_top,
+                            const int pad_left,
+                            const int out_d,
+                            const int out_h,
+                            const int out_w) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    d_in_data[in_index + c] += d_out_data[out_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DGradNCDHW(T* d_in_data,
+                    const int num,
+                    const int channels,
+                    const int in_depth,
+                    const int in_height,
+                    const int in_width,
+                    const int out_depth,
+                    const int out_height,
+                    const int out_width,
+                    const int pad_front,
+                    const int pad_top,
+                    const int pad_left,
+                    const T* d_out_data,
+                    void (*pad_func)(T*,
+                                     const T*,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(d_in_data,
+                     d_out_data,
+                     in_depth,
+                     in_height,
+                     in_width,
+                     out_depth,
+                     out_height,
+                     out_width,
+                     pad_front,
+                     pad_top,
+                     pad_left,
+                     out_d,
+                     out_h,
+                     out_w);
+          }
+        }
+      }
+      d_in_data += in_depth * in_height * in_width;
+      d_out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DGradNDHWC(T* d_in_data,
+                    const int num,
+                    const int channels,
+                    const int in_depth,
+                    const int in_height,
+                    const int in_width,
+                    const int out_depth,
+                    const int out_height,
+                    const int out_width,
+                    const int pad_front,
+                    const int pad_top,
+                    const int pad_left,
+                    const T* d_out_data,
+                    void (*pad_func)(T*,
+                                     const T*,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int,
+                                     const int)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(d_in_data,
+                   d_out_data,
+                   channels,
+                   in_depth,
+                   in_height,
+                   in_width,
+                   out_depth,
+                   out_height,
+                   out_width,
+                   pad_front,
+                   pad_top,
+                   pad_left,
+                   out_d,
+                   out_h,
+                   out_w);
+        }
+      }
+    }
+    d_in_data += in_depth * in_height * in_width * channels;
+    d_out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+template <typename T, typename Context>
+void Pad3dGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& out_grad,
+                     const ScalarArray& paddings,
+                     const std::string& mode,
+                     float pad_value,
+                     const std::string& data_format,
+                     DenseTensor* x_grad) {
+  std::vector<int64_t> pads = paddings.GetData();
+
+  auto* d_out = &out_grad;
+  auto* d_in = x_grad;
+  auto d_in_dims = d_in->dims();
+  auto d_out_dims = d_out->dims();
+  const T* d_out_data = d_out->data<T>();
+  T* d_in_data = dev_ctx.template Alloc<T>(d_in);
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, d_in, static_cast<T>(0));
+
+  const int pad_left = pads[0];
+  const int pad_top = pads[2];
+  const int pad_front = pads[4];
+  const int num = d_in_dims[0];
+  if (data_format == "NCDHW") {
+    const int channels = d_in_dims[1];
+    const int in_depth = d_in_dims[2];
+    const int in_height = d_in_dims[3];
+    const int in_width = d_in_dims[4];
+    const int out_depth = d_out_dims[2];
+    const int out_height = d_out_dims[3];
+    const int out_width = d_out_dims[4];
+
+    std::map<std::string,
+             void (*)(T*,
+                      const T*,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int)>
+        func_map;
+
+    func_map["reflect"] = ReflectPad3DGradNCDHW;
+    func_map["replicate"] = ReplicatePad3DGradNCDHW;
+    func_map["circular"] = CircularPad3DGradNCDHW;
+    func_map["constant"] = ConstPad3DGradNCDHW;
+
+    Pad3DGradNCDHW(d_in_data,
+                   num,
+                   channels,
+                   in_depth,
+                   in_height,
+                   in_width,
+                   out_depth,
+                   out_height,
+                   out_width,
+                   pad_front,
+                   pad_top,
+                   pad_left,
+                   d_out_data,
+                   func_map[mode]);
+  } else {
+    const int channels = d_in_dims[4];
+    const int in_depth = d_in_dims[1];
+    const int in_height = d_in_dims[2];
+    const int in_width = d_in_dims[3];
+    const int out_depth = d_out_dims[1];
+    const int out_height = d_out_dims[2];
+    const int out_width = d_out_dims[3];
+
+    std::map<std::string,
+             void (*)(T*,
+                      const T*,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int)>
+        func_map;
+
+    func_map["reflect"] = ReflectPad3DGradNDHWC;
+    func_map["replicate"] = ReplicatePad3DGradNDHWC;
+    func_map["circular"] = CircularPad3DGradNDHWC;
+    func_map["constant"] = ConstPad3DGradNDHWC;
+
+    Pad3DGradNDHWC(d_in_data,
+                   num,
+                   channels,
+                   in_depth,
+                   in_height,
+                   in_width,
+                   out_depth,
+                   out_height,
+                   out_width,
+                   pad_front,
+                   pad_top,
+                   pad_left,
+                   d_out_data,
+                   func_map[mode]);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    pad3d_grad, CPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/pad3d_kernel.cc b/paddle/phi/kernels/cpu/pad3d_kernel.cc
new file mode 100644
index 0000000000000..68bd92168364d
--- /dev/null
+++ b/paddle/phi/kernels/cpu/pad3d_kernel.cc
@@ -0,0 +1,587 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad3d_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T>
+void ConstPad3DFuncNCDHW(const T* in_data,
+                         T* out_data,
+                         const int in_depth,
+                         const int in_height,
+                         const int in_width,
+                         const int out_depth,
+                         const int out_height,
+                         const int out_width,
+                         const int pad_front,
+                         const int pad_top,
+                         const int pad_left,
+                         const int out_d,
+                         const int out_h,
+                         const int out_w,
+                         const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+       in_h >= in_height || in_w >= in_width)
+          ? value
+          : in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ConstPad3DFuncNDHWC(const T* in_data,
+                         T* out_data,
+                         const int channels,
+                         const int in_depth,
+                         const int in_height,
+                         const int in_width,
+                         const int out_depth,
+                         const int out_height,
+                         const int out_width,
+                         const int pad_front,
+                         const int pad_top,
+                         const int pad_left,
+                         const int out_d,
+                         const int out_h,
+                         const int out_w,
+                         const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  if (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+      in_h >= in_height || in_w >= in_width) {
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = value;
+    }
+  } else {
+    const int in_index =
+        (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+    for (int c = 0; c < channels; ++c) {
+      out_data[out_index + c] = in_data[in_index + c];
+    }
+  }
+}
+
+template <typename T>
+void ReflectPad3DFuncNCDHW(const T* in_data,
+                           T* out_data,
+                           const int in_depth,
+                           const int in_height,
+                           const int in_width,
+                           const int out_depth,
+                           const int out_height,
+                           const int out_width,
+                           const int pad_front,
+                           const int pad_top,
+                           const int pad_left,
+                           const int out_d,
+                           const int out_h,
+                           const int out_w,
+                           const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);                     // reflect by 0
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+  in_h = std::max(in_h, -in_h);                     // reflect by 0
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+  in_w = std::max(in_w, -in_w);                     // reflect by 0
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReflectPad3DFuncNDHWC(const T* in_data,
+                           T* out_data,
+                           const int channels,
+                           const int in_depth,
+                           const int in_height,
+                           const int in_width,
+                           const int out_depth,
+                           const int out_height,
+                           const int out_width,
+                           const int pad_front,
+                           const int pad_top,
+                           const int pad_left,
+                           const int out_d,
+                           const int out_h,
+                           const int out_w,
+                           const T value) {
+  int in_d = out_d - pad_front;
+  int in_h = out_h - pad_top;
+  int in_w = out_w - pad_left;
+
+  in_d = std::max(in_d, -in_d);
+  in_d = std::min(in_d, 2 * in_depth - in_d - 2);
+  in_h = std::max(in_h, -in_h);
+  in_h = std::min(in_h, 2 * in_height - in_h - 2);
+  in_w = std::max(in_w, -in_w);
+  in_w = std::min(in_w, 2 * in_width - in_w - 2);
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void ReplicatePad3DFuncNCDHW(const T* in_data,
+                             T* out_data,
+                             const int in_depth,
+                             const int in_height,
+                             const int in_width,
+                             const int out_depth,
+                             const int out_height,
+                             const int out_width,
+                             const int pad_front,
+                             const int pad_top,
+                             const int pad_left,
+                             const int out_d,
+                             const int out_h,
+                             const int out_w,
+                             const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void ReplicatePad3DFuncNDHWC(const T* in_data,
+                             T* out_data,
+                             const int channels,
+                             const int in_depth,
+                             const int in_height,
+                             const int in_width,
+                             const int out_depth,
+                             const int out_height,
+                             const int out_width,
+                             const int pad_front,
+                             const int pad_top,
+                             const int pad_left,
+                             const int out_d,
+                             const int out_h,
+                             const int out_w,
+                             const T value) {
+  int in_d = std::min(in_depth - 1, std::max(out_d - pad_front, 0));
+  int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+  int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void CircularPad3DFuncNCDHW(const T* in_data,
+                            T* out_data,
+                            const int in_depth,
+                            const int in_height,
+                            const int in_width,
+                            const int out_depth,
+                            const int out_height,
+                            const int out_width,
+                            const int pad_front,
+                            const int pad_top,
+                            const int pad_left,
+                            const int out_d,
+                            const int out_h,
+                            const int out_w,
+                            const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  out_data[out_d * out_height * out_width + out_h * out_width + out_w] =
+      in_data[in_d * in_height * in_width + in_h * in_width + in_w];
+}
+
+template <typename T>
+void CircularPad3DFuncNDHWC(const T* in_data,
+                            T* out_data,
+                            const int channels,
+                            const int in_depth,
+                            const int in_height,
+                            const int in_width,
+                            const int out_depth,
+                            const int out_height,
+                            const int out_width,
+                            const int pad_front,
+                            const int pad_top,
+                            const int pad_left,
+                            const int out_d,
+                            const int out_h,
+                            const int out_w,
+                            const T value) {
+  int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+  int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+  int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+  const int out_index =
+      (out_d * out_height * out_width + out_h * out_width + out_w) * channels;
+  const int in_index =
+      (in_d * in_height * in_width + in_h * in_width + in_w) * channels;
+  for (int c = 0; c < channels; ++c) {
+    out_data[out_index + c] = in_data[in_index + c];
+  }
+}
+
+template <typename T>
+void Pad3DNCDHW(const T* in_data,
+                const int num,
+                const int channels,
+                const int in_depth,
+                const int in_height,
+                const int in_width,
+                const int out_depth,
+                const int out_height,
+                const int out_width,
+                const int pad_front,
+                const int pad_top,
+                const int pad_left,
+                T value,
+                T* out_data,
+                void (*pad_func)(const T*,
+                                 T*,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int out_d = 0; out_d < out_depth; ++out_d) {
+        for (int out_h = 0; out_h < out_height; ++out_h) {
+          for (int out_w = 0; out_w < out_width; ++out_w) {
+            pad_func(in_data,
+                     out_data,
+                     in_depth,
+                     in_height,
+                     in_width,
+                     out_depth,
+                     out_height,
+                     out_width,
+                     pad_front,
+                     pad_top,
+                     pad_left,
+                     out_d,
+                     out_h,
+                     out_w,
+                     value);
+          }
+        }
+      }
+      in_data += in_depth * in_height * in_width;
+      out_data += out_depth * out_height * out_width;
+    }
+  }
+}
+
+template <typename T>
+void Pad3DNDHWC(const T* in_data,
+                const int num,
+                const int channels,
+                const int in_depth,
+                const int in_height,
+                const int in_width,
+                const int out_depth,
+                const int out_height,
+                const int out_width,
+                const int pad_front,
+                const int pad_top,
+                const int pad_left,
+                T value,
+                T* out_data,
+                void (*pad_func)(const T*,
+                                 T*,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const int,
+                                 const T)) {
+  for (int n = 0; n < num; ++n) {
+    for (int out_d = 0; out_d < out_depth; ++out_d) {
+      for (int out_h = 0; out_h < out_height; ++out_h) {
+        for (int out_w = 0; out_w < out_width; ++out_w) {
+          pad_func(in_data,
+                   out_data,
+                   channels,
+                   in_depth,
+                   in_height,
+                   in_width,
+                   out_depth,
+                   out_height,
+                   out_width,
+                   pad_front,
+                   pad_top,
+                   pad_left,
+                   out_d,
+                   out_h,
+                   out_w,
+                   value);
+        }
+      }
+    }
+    in_data += in_depth * in_height * in_width * channels;
+    out_data += out_depth * out_height * out_width * channels;
+  }
+}
+
+template <typename T, typename Context>
+void Pad3dKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const ScalarArray& paddings,
+                 const std::string& mode,
+                 float pad_value,
+                 const std::string& data_format,
+                 DenseTensor* out) {
+  T value = static_cast<T>(pad_value);
+  std::vector<int64_t> pads = paddings.GetData();
+
+  auto in_dims = x.dims();
+  const T* in_data = x.data<T>();
+
+  if (data_format == "NCDHW") {
+    out->Resize({in_dims[0],
+                 in_dims[1],
+                 in_dims[2] + pads[4] + pads[5],
+                 in_dims[3] + pads[2] + pads[3],
+                 in_dims[4] + pads[0] + pads[1]});
+  } else {
+    out->Resize({in_dims[0],
+                 in_dims[1] + pads[4] + pads[5],
+                 in_dims[2] + pads[2] + pads[3],
+                 in_dims[3] + pads[0] + pads[1],
+                 in_dims[4]});
+  }
+
+  auto out_dims = out->dims();
+  T* out_data = dev_ctx.template Alloc<T>(out);
+
+  int channels = in_dims[1];
+  int in_depth = in_dims[2];
+  int in_height = in_dims[3];
+  int in_width = in_dims[4];
+  int out_depth = out_dims[2];
+  int out_height = out_dims[3];
+  int out_width = out_dims[4];
+  if (data_format == "NDHWC") {
+    channels = in_dims[4];
+    in_depth = in_dims[1];
+    in_height = in_dims[2];
+    in_width = in_dims[3];
+    out_depth = out_dims[1];
+    out_height = out_dims[2];
+    out_width = out_dims[3];
+  }
+
+  if (mode == "reflect") {
+    PADDLE_ENFORCE_GT(
+        in_depth,
+        pads[4],
+        errors::InvalidArgument("The depth of Input(X)'s dimension should be "
+                                "greater than pad_front"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_front(%d).",
+                                in_depth,
+                                pads[4]));
+    PADDLE_ENFORCE_GT(
+        in_depth,
+        pads[5],
+        errors::InvalidArgument("The depth of Input(X)'s dimension should be "
+                                "greater than pad_back"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_back(%d).",
+                                in_depth,
+                                pads[5]));
+
+    PADDLE_ENFORCE_GT(
+        in_height,
+        pads[2],
+        errors::InvalidArgument("The height of Input(X)'s dimension should be "
+                                "greater than pad_top"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_top(%d).",
+                                in_height,
+                                pads[2]));
+    PADDLE_ENFORCE_GT(
+        in_height,
+        pads[3],
+        errors::InvalidArgument("The height of Input(X)'s dimension should be "
+                                "greater than pad_bottom"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_bottom(%d).",
+                                in_height,
+                                pads[3]));
+
+    PADDLE_ENFORCE_GT(
+        in_width,
+        pads[0],
+        errors::InvalidArgument("The width of Input(X)'s dimension should be "
+                                "greater than pad_left"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_left(%d).",
+                                in_width,
+                                pads[0]));
+    PADDLE_ENFORCE_GT(
+        in_width,
+        pads[1],
+        errors::InvalidArgument("The width of Input(X)'s dimension should be "
+                                "greater than pad_right"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_right(%d).",
+                                in_width,
+                                pads[1]));
+  } else if (mode == "circular" || mode == "replicate") {
+    PADDLE_ENFORCE_NE(in_depth * in_height * in_width,
+                      0,
+                      errors::InvalidArgument(
+                          "The input tensor size can not be 0 for circular "
+                          "or replicate padding mode."));
+  }
+
+  const int pad_left = pads[0];
+  const int pad_top = pads[2];
+  const int pad_front = pads[4];
+  const int num = in_dims[0];
+  if (data_format == "NCDHW") {
+    std::map<std::string,
+             void (*)(const T*,
+                      T*,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const T)>
+        func_map;
+
+    func_map["reflect"] = ReflectPad3DFuncNCDHW;
+    func_map["replicate"] = ReplicatePad3DFuncNCDHW;
+    func_map["circular"] = CircularPad3DFuncNCDHW;
+    func_map["constant"] = ConstPad3DFuncNCDHW;
+    Pad3DNCDHW(in_data,
+               num,
+               channels,
+               in_depth,
+               in_height,
+               in_width,
+               out_depth,
+               out_height,
+               out_width,
+               pad_front,
+               pad_top,
+               pad_left,
+               value,
+               out_data,
+               func_map[mode]);
+  } else {
+    std::map<std::string,
+             void (*)(const T*,
+                      T*,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const int,
+                      const T)>
+        func_map;
+
+    func_map["reflect"] = ReflectPad3DFuncNDHWC;
+    func_map["replicate"] = ReplicatePad3DFuncNDHWC;
+    func_map["circular"] = CircularPad3DFuncNDHWC;
+    func_map["constant"] = ConstPad3DFuncNDHWC;
+    Pad3DNDHWC(in_data,
+               num,
+               channels,
+               in_depth,
+               in_height,
+               in_width,
+               out_depth,
+               out_height,
+               out_width,
+               pad_front,
+               pad_top,
+               pad_left,
+               value,
+               out_data,
+               func_map[mode]);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(pad3d,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::Pad3dKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc
new file mode 100644
index 0000000000000..97558cdb31f66
--- /dev/null
+++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/prelu_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PReluGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& alpha,
+                     const DenseTensor& out_grad,
+                     const std::string& mode,
+                     const std::string& data_format,
+                     DenseTensor* x_grad,
+                     DenseTensor* alpha_grad) {
+  const T* alpha_ptr = alpha.data<T>();
+  const T* x_ptr = x.data<T>();
+  const T* out_grad_ptr = out_grad.data<T>();
+  int numel = x.numel();
+  auto dim = x.dims();
+  int index = 0;
+  int i = 0;
+  if (x_grad) {
+    T* x_grad_ptr = dev_ctx.template Alloc<T>(x_grad);
+    if (mode == "channel") {
+      if (data_format == "NCHW") {
+        int temp = 1;
+        for (int j = 2; j < dim.size(); j++) {
+          temp *= dim[j];
+        }
+        for (i = 0; i < numel; i++) {
+          index = (i / temp) % dim[1];
+          x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i]
+                                       : alpha_ptr[index] * out_grad_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          index = i % dim[dim.size() - 1];
+          x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i]
+                                       : alpha_ptr[index] * out_grad_ptr[i];
+        }
+      }
+    } else if (mode == "element") {
+      int temp = 1;
+      for (int j = 1; j < dim.size(); j++) {
+        temp *= dim[j];
+      }
+      for (i = 0; i < numel; i++) {
+        index = i % temp;
+        x_grad_ptr[i] =
+            x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i];
+      }
+    } else {
+      for (i = 0; i < numel; i++) {
+        x_grad_ptr[i] =
+            x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i];
+      }
+    }
+  }
+
+  index = 0;
+  if (alpha_grad) {
+    T* alpha_grad_ptr = dev_ctx.template Alloc<T>(alpha_grad);
+    memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel());
+
+    if (mode == "channel") {
+      if (data_format == "NCHW") {
+        int temp = 1;
+        for (int j = 2; j < dim.size(); j++) {
+          temp *= dim[j];
+        }
+        for (i = 0; i < numel; i++) {
+          index = (i / temp) % dim[1];
+          alpha_grad_ptr[index] +=
+              x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          index = i % dim[dim.size() - 1];
+          alpha_grad_ptr[index] +=
+              x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
+        }
+      }
+    } else if (mode == "element") {
+      int temp = 1;
+      for (int j = 1; j < dim.size(); j++) {
+        temp *= dim[j];
+      }
+      for (i = 0; i < numel; i++) {
+        index = i % temp;
+        alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
+      }
+    } else {
+      for (i = 0; i < numel; i++) {
+        alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc
new file mode 100644
index 0000000000000..8f389ab9ff459
--- /dev/null
+++ b/paddle/phi/kernels/cpu/prelu_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/prelu_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PReluKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const DenseTensor& alpha,
+                 const std::string& mode,
+                 const std::string& data_format,
+                 DenseTensor* out) {
+  const T* x_ptr = x.data<T>();
+  const T* alpha_ptr = alpha.data<T>();
+  T* o_ptr = dev_ctx.template Alloc<T>(out);
+
+  int numel = x.numel();
+  auto dim = x.dims();
+  int index = 0;
+  int i = 0;
+  if (mode == "channel") {
+    if (data_format == "NCHW") {
+      int temp = 1;
+      for (int j = 2; j < dim.size(); j++) {
+        temp *= dim[j];
+      }
+      for (i = 0; i < numel; i++) {
+        index = (i / temp) % dim[1];
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+      }
+    } else {
+      for (i = 0; i < numel; i++) {
+        index = i % dim[dim.size() - 1];
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+      }
+    }
+  } else if (mode == "element") {
+    int temp = 1;
+    for (int j = 1; j < dim.size(); j++) {
+      temp *= dim[j];
+    }
+    for (i = 0; i < numel; i++) {
+      index = i % temp;
+      o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+    }
+  } else {
+    for (i = 0; i < numel; i++) {
+      o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc
new file mode 100644
index 0000000000000..e2e32567441ae
--- /dev/null
+++ b/paddle/phi/kernels/cpu/qr_kernel.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Eigen/Dense>
+
+#include "paddle/phi/kernels/qr_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+
+namespace phi {
+
+static inline std::tuple<bool, bool> ParseQrMode(const std::string& mode) {
+  bool compute_q;
+  bool reduced;
+  if (mode == "reduced") {
+    compute_q = true;
+    reduced = true;
+  } else if (mode == "complete") {
+    compute_q = true;
+    reduced = false;
+  } else if (mode == "r") {
+    compute_q = false;
+    reduced = true;
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "QR received unrecognized mode '%s'"
+        " but expected one of 'reduced' (default), 'r', or 'complete'",
+        mode));
+  }
+  return std::make_tuple(compute_q, reduced);
+}
+
+template <typename T, typename Context>
+void QrKernel(const Context& ctx,
+              const DenseTensor& x,
+              const std::string& mode,
+              DenseTensor* q,
+              DenseTensor* r) {
+  bool compute_q;
+  bool reduced_mode;
+  std::tie(compute_q, reduced_mode) = ParseQrMode(mode);
+  auto numel = x.numel();
+  PADDLE_ENFORCE_GT(
+      numel, 0, errors::PreconditionNotMet("The input of QR is empty."));
+  auto x_dims = x.dims();
+  int x_rank = x_dims.size();
+  int m = x_dims[x_rank - 2];
+  int n = x_dims[x_rank - 1];
+  int min_mn = std::min(m, n);
+  int k = reduced_mode ? min_mn : m;
+  int batch_size = numel / (m * n);
+  int x_stride = m * n;
+  int q_stride = m * k;
+  int r_stride = k * n;
+  auto* x_data = x.data<phi::dtype::Real<T>>();
+  T* q_data = nullptr;
+  if (compute_q) {
+    q_data = ctx.template Alloc<phi::dtype::Real<T>>(
+        q, batch_size * m * k * sizeof(phi::dtype::Real<T>));
+  }
+  auto* r_data = ctx.template Alloc<phi::dtype::Real<T>>(
+      r, batch_size * k * n * sizeof(phi::dtype::Real<T>));
+
+  // Implement QR by calling Eigen
+  for (int i = 0; i < batch_size; ++i) {
+    const T* x_matrix_ptr = x_data + i * x_stride;
+    T* r_matrix_ptr = r_data + i * r_stride;
+    using EigenDynamicMatrix =
+        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    auto x_matrix = Eigen::Map<const EigenDynamicMatrix>(x_matrix_ptr, m, n);
+    Eigen::HouseholderQR<EigenDynamicMatrix> qr(x_matrix);
+    if (reduced_mode) {
+      auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n);
+      auto r_matrix_view =
+          qr_top_matrix.template triangularView<Eigen::Upper>();
+      auto r_matrix = EigenDynamicMatrix(r_matrix_view);
+      memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
+    } else {
+      auto r_matrix_view =
+          qr.matrixQR().template triangularView<Eigen::Upper>();
+      auto r_matrix = EigenDynamicMatrix(r_matrix_view);
+      memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T));
+    }
+
+    if (compute_q) {
+      T* q_matrix_ptr = q_data + i * q_stride;
+      if (reduced_mode) {
+        auto q_matrix =
+            qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn);
+        q_matrix.transposeInPlace();
+        memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
+      } else {
+        auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m);
+        q_matrix.transposeInPlace();
+        memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T));
+      }
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h
index 4e268d40038cf..af67bdf5d624f 100644
--- a/paddle/phi/kernels/cpu/reduce.h
+++ b/paddle/phi/kernels/cpu/reduce.h
@@ -239,4 +239,29 @@ void Reduce(const DeviceContext& dev_ctx,
   }
 }
 
+template <typename DeviceContext, typename OutT, typename Functor>
+void BoolReduceKernel(const DeviceContext& dev_ctx,
+                      const phi::DenseTensor& input,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      phi::DenseTensor* output) {
+  dev_ctx.template Alloc<OutT>(output);
+
+  // The dims has full dim, set the reduce_all is True
+  const auto& input_dim_size = input.dims().size();
+  std::set<int> dims_set(dims.begin(), dims.end());
+  bool full_dim = true;
+  for (auto i = 0; i < input_dim_size; i++) {
+    if (dims_set.find(i) == dims_set.end()) {
+      full_dim = false;
+      break;
+    }
+  }
+  reduce_all = (reduce_all || full_dim);
+
+  ReduceKernelImpl<DeviceContext, bool, OutT, Functor>(
+      dev_ctx, input, output, dims, keep_dim, reduce_all);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc
similarity index 53%
rename from paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
rename to paddle/phi/kernels/cpu/reduce_grad_kernel.cc
index efea054555e86..4b3b1fc16e9c4 100644
--- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc
@@ -12,33 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cast_kernel.h"
-#include "paddle/phi/kernels/cpu/reduce_grad.h"
 #include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h"
 namespace phi {
 
-struct SumGradFunctor {
-  template <typename DeviceContext,
-            typename X,
-            typename Y,
-            typename DX,
-            typename DY,
-            typename Dim>
-  void operator()(const DeviceContext& place,
-                  X* x,
-                  Y* y,
-                  DX* dx,
-                  DY* dy,
-                  const Dim& dim,
-                  int size) {
-    dx->device(place) = dy->broadcast(dim);
-  }
-};
-
 template <typename T, typename Context>
 void ComputeFromInput(const Context& dev_ctx,
                       const DenseTensor& x,
@@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx,
     }
   }
 
-  ReduceGradKernel<Context, T, SumGradFunctor, true>(dev_ctx,
-                                                     x,
-                                                     out_grad,
-                                                     paddle::none,
-                                                     dims,
-                                                     keep_dim,
-                                                     reduce_all,
-                                                     in_dtype,
-                                                     out_dtype,
-                                                     x_grad);
+  ReduceGradKernel<Context, T, funcs::SumGradFunctor, true>(dev_ctx,
+                                                            x,
+                                                            paddle::none,
+                                                            out_grad,
+                                                            dims,
+                                                            keep_dim,
+                                                            reduce_all,
+                                                            in_dtype,
+                                                            out_dtype,
+                                                            x_grad);
+}
+
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MeanGradFunctor, true>(dev_ctx,
+                                                             x,
+                                                             paddle::none,
+                                                             out_grad,
+                                                             dims,
+                                                             keep_dim,
+                                                             reduce_all,
+                                                             in_dtype,
+                                                             out_dtype,
+                                                             x_grad);
 }
 
 }  // namespace phi
@@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad,
                    int64_t,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(mean_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMeanGradKernel,
+                   bool,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(prod_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceProdGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(max_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(min_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc
new file mode 100644
index 0000000000000..bc99e2cb39a69
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reduce_kernel.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/reduce.h"
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::MeanFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out) {
+  phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void ProdRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<CPUContext, T, phi::funcs::MinFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  phi::BoolReduceKernel<CPUContext, T, phi::funcs::AllFunctor>(
+      dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  phi::BoolReduceKernel<CPUContext, T, phi::funcs::AnyFunctor>(
+      dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(sum_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SumRawKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+PD_REGISTER_KERNEL(
+    mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {}
+
+PD_REGISTER_KERNEL(prod_raw,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ProdRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(
+    max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(
+    min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
+PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
diff --git a/paddle/phi/kernels/cpu/reverse_kernel.cc b/paddle/phi/kernels/cpu/reverse_kernel.cc
new file mode 100644
index 0000000000000..43eff7c055090
--- /dev/null
+++ b/paddle/phi/kernels/cpu/reverse_kernel.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reverse_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/reverse_kernel_impl.h"
+
+PD_REGISTER_KERNEL(reverse,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReverseKernel,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   bool,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc
new file mode 100644
index 0000000000000..a91b8b6c1fcd3
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc
@@ -0,0 +1,203 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_align_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <class T>
+void bilinear_interpolate_gradient(const int height,
+                                   const int width,
+                                   T y,
+                                   T x,
+                                   const T out_grad_this_bin,
+                                   const T count,
+                                   T* batch_grad_data) {
+  int x_low, y_low, x_high, y_high;
+  T w1, w2, w3, w4;
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    w1 = w2 = w3 = w4 = 0;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  y_low = static_cast<int>(y);
+  x_low = static_cast<int>(x);
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+  T diff1 = out_grad_this_bin * w1 / count;
+  T diff2 = out_grad_this_bin * w2 / count;
+  T diff3 = out_grad_this_bin * w3 / count;
+  T diff4 = out_grad_this_bin * w4 / count;
+  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+    *(batch_grad_data + y_low * width + x_low) += diff1;
+    *(batch_grad_data + y_low * width + x_high) += diff2;
+    *(batch_grad_data + y_high * width + x_low) += diff3;
+    *(batch_grad_data + y_high * width + x_high) += diff4;
+  }
+}
+
+template <typename T, typename Context>
+void RoiAlignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& boxes,
+                        paddle::optional<const DenseTensor&> boxes_num,
+                        const DenseTensor& out_grad,
+                        int pooled_height,
+                        int pooled_width,
+                        float spatial_scale,
+                        int sampling_ratio,
+                        bool aligned,
+                        DenseTensor* dx) {
+  auto in_dims = x.dims();
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = boxes.dims()[0];
+
+  if (!dx) {
+    return;
+  }
+
+  DenseTensor roi_batch_id_list = Empty<int>(dev_ctx, {rois_num});
+  int* box_batch_id_data = roi_batch_id_list.data<int>();
+
+  int boxes_batch_size;
+  if (boxes_num) {
+    boxes_batch_size = boxes_num->numel();
+    auto* boxes_num_data = boxes_num->data<int>();
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_data[n]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+      start += boxes_num_data[n];
+    }
+  } else {
+    auto boxes_lod = boxes.lod().back();
+    boxes_batch_size = boxes_lod.size() - 1;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+    }
+  }
+  dev_ctx.template Alloc<T>(dx);
+
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, dx, static_cast<T>(0));
+
+  int output_grad_size = out_grad.numel();
+
+  if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) {
+    return;
+  }
+
+  const T* boxes_data = boxes.data<T>();
+  const T* out_grad_data = out_grad.data<T>();
+  T* dx_data = dev_ctx.template Alloc<T>(dx);
+
+  auto in_stride = phi::stride(x.dims());
+  auto roi_stride = phi::stride(boxes.dims());
+  auto out_stride = phi::stride(out_grad.dims());
+
+  T roi_offset = aligned ? T(0.5) : 0;
+  for (int n = 0; n < rois_num; ++n) {
+    int box_batch_idx = box_batch_id_data[n];
+    T roi_xmin = boxes_data[0] * spatial_scale - roi_offset;
+    T roi_ymin = boxes_data[1] * spatial_scale - roi_offset;
+    T roi_xmax = boxes_data[2] * spatial_scale - roi_offset;
+    T roi_ymax = boxes_data[3] * spatial_scale - roi_offset;
+
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    roi_width = std::max(roi_width, static_cast<T>(1.));
+    roi_height = std::max(roi_height, static_cast<T>(1.));
+    if (!aligned) {
+      roi_width = std::max(roi_width, static_cast<T>(1.));
+      roi_height = std::max(roi_height, static_cast<T>(1.));
+    }
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+    for (int c = 0; c < channels; ++c) {
+      T* batch_grad_data =
+          dx_data + box_batch_idx * in_stride[0] + c * in_stride[1];
+      const T* batch_out_grad_data =
+          out_grad_data + n * out_stride[0] + c * out_stride[1];
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int pool_index = ph * pooled_width + pw;
+          T out_grad_this_bin = batch_out_grad_data[pool_index];
+          int roi_bin_grid_h = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_height / pooled_height);
+          int roi_bin_grid_w = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_width / pooled_width);
+          T count = roi_bin_grid_h * roi_bin_grid_w;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            const T y = roi_ymin + ph * bin_size_h +
+                        static_cast<T>(iy + .5f) * bin_size_h /
+                            static_cast<T>(roi_bin_grid_h);
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              const T x = roi_xmin + pw * bin_size_w +
+                          static_cast<T>(ix + .5f) * bin_size_w /
+                              static_cast<T>(roi_bin_grid_w);
+              bilinear_interpolate_gradient(height,
+                                            width,
+                                            y,
+                                            x,
+                                            out_grad_this_bin,
+                                            count,
+                                            batch_grad_data);
+            }
+          }
+        }
+      }
+    }
+    boxes_data += roi_stride[0];
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roi_align_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::RoiAlignGradKernel,
+                   float,
+                   double,
+                   int) {}
diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc
index 35ab99a98eba7..4752a9b3a48fd 100644
--- a/paddle/phi/kernels/cpu/roi_align_kernel.cc
+++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc
@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values,
 }
 
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& boxes,
                     paddle::optional<const DenseTensor&> boxes_num,
@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx,
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {}
+    roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {}
diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc
new file mode 100644
index 0000000000000..0eaa873590eb0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RoiPoolGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& boxes,
+                       paddle::optional<const DenseTensor&> boxes_num,
+                       const DenseTensor& arg_max,
+                       const DenseTensor& out_grad,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       DenseTensor* dx) {
+  if (dx) {
+    int rois_num = boxes.dims()[0];
+    DenseTensor box_batch_id_list = Empty<int>(dev_ctx, {rois_num});
+    int* box_batch_id_data = box_batch_id_list.data<int>();
+
+    int boxes_batch_size;
+    if (boxes_num) {
+      boxes_batch_size = boxes_num->numel();
+      auto* boxes_num_data = boxes_num->data<int>();
+      int start = 0;
+      for (int n = 0; n < boxes_batch_size; ++n) {
+        for (int i = start; i < start + boxes_num_data[n]; ++i) {
+          box_batch_id_data[i] = n;
+        }
+        start += boxes_num_data[n];
+      }
+    } else {
+      auto boxes_lod = boxes.lod().back();
+      boxes_batch_size = boxes_lod.size() - 1;
+      for (int n = 0; n < boxes_batch_size; ++n) {
+        for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+          box_batch_id_data[i] = n;
+        }
+      }
+    }
+
+    const T* boxes_data = boxes.data<T>();
+    const T* out_grad_data = out_grad.data<T>();
+    const int64_t* arg_max_data = arg_max.data<int64_t>();
+    T* dx_data = dev_ctx.template Alloc<T>(dx);
+
+    phi::funcs::SetConstant<Context, T> set_zero;
+    set_zero(dev_ctx, dx, static_cast<T>(0));
+
+    auto in_stride = phi::stride(x.dims());
+    auto arg_max_stride = phi::stride(arg_max.dims());
+    auto roi_stride = phi::stride(boxes.dims());
+    auto out_stride = phi::stride(out_grad.dims());
+
+    int channels = x.dims()[1];
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_idx = box_batch_id_data[n];
+      T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0];
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int pool_index = ph * pooled_width + pw;
+            if (arg_max_data[pool_index] >= 0) {
+              auto index = arg_max_data[pool_index];
+              batch_grad_data[index] += out_grad_data[pool_index];
+            }
+          }
+        }
+        batch_grad_data += in_stride[1];
+        out_grad_data += out_stride[1];
+        arg_max_data += arg_max_stride[1];
+      }
+      boxes_data += roi_stride[0];
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roi_pool_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::RoiPoolGradKernel,
+                   float,
+                   double,
+                   int) {
+  kernel->InputAt(3).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc
new file mode 100644
index 0000000000000..02020354cd357
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_pool_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RoiPoolKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& boxes,
+                   paddle::optional<const DenseTensor&> boxes_num,
+                   int pooled_height,
+                   int pooled_width,
+                   float spatial_scale,
+                   DenseTensor* out,
+                   DenseTensor* arg_max) {
+  auto x_dims = x.dims();
+  int batch_size = x_dims[0];
+  int channels = x_dims[1];
+  int height = x_dims[2];
+  int width = x_dims[3];
+  int rois_num = boxes.dims()[0];
+
+  auto in_stride = phi::stride(x_dims);
+  auto arg_max_stride = phi::stride(arg_max->dims());
+  auto box_stride = phi::stride(boxes.dims());
+  auto out_stride = phi::stride(out->dims());
+
+  const T* input_data = x.data<T>();
+
+  DenseTensor box_batch_id_list = Empty<int>(dev_ctx, {rois_num});
+  int* box_batch_id_data = box_batch_id_list.data<int>();
+
+  int boxes_batch_size;
+  if (boxes_num) {
+    boxes_batch_size = boxes_num->numel();
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        phi::errors::InvalidArgument("The boxes_batch_size and imgs "
+                                     "batch_size must be the same."));
+    auto* boxes_num_data = boxes_num->data<int>();
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_data[n]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+      start += boxes_num_data[n];
+    }
+  } else {
+    auto boxes_lod = boxes.lod().back();
+    boxes_batch_size = boxes_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        phi::errors::InvalidArgument("The boxes_batch_size and imgs "
+                                     "batch_size must be the same."));
+    int rois_num_with_lod = boxes_lod[boxes_batch_size];
+    PADDLE_ENFORCE_EQ(
+        rois_num,
+        rois_num_with_lod,
+        phi::errors::InvalidArgument("The rois_num from input "
+                                     "and lod must be the same."));
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+    }
+  }
+
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
+
+  const T* boxes_data = boxes.data<T>();
+  for (int n = 0; n < rois_num; ++n) {
+    int box_batch_id = box_batch_id_data[n];
+    int box_start_w = round(boxes_data[0] * spatial_scale);
+    int box_start_h = round(boxes_data[1] * spatial_scale);
+    int box_end_w = round(boxes_data[2] * spatial_scale);
+    int box_end_h = round(boxes_data[3] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    int box_height = std::max(box_end_h - box_start_h + 1, 1);
+    int box_width = std::max(box_end_w - box_start_w + 1, 1);
+
+    const float bin_size_h =
+        static_cast<float>(box_height) / static_cast<float>(pooled_height);
+    const float bin_size_w =
+        static_cast<float>(box_width) / static_cast<float>(pooled_width);
+
+    const T* batch_data = input_data + box_batch_id * in_stride[0];
+
+    for (int c = 0; c < channels; ++c) {
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          //  Compute pooling region for this output unit:
+          //  start (included) = floor(ph * box_height / pooled_height_)
+          //  end (excluded) = ceil((ph + 1) * box_height / pooled_height_)
+          int hstart =
+              static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
+          int wstart =
+              static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
+          int hend =
+              static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
+          int wend =
+              static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
+
+          hstart = std::min(std::max(hstart + box_start_h, 0), height);
+          hend = std::min(std::max(hend + box_start_h, 0), height);
+          wstart = std::min(std::max(wstart + box_start_w, 0), width);
+          wend = std::min(std::max(wend + box_start_w, 0), width);
+
+          const int pool_index = ph * pooled_width + pw;
+
+          // Define an empty pooling region to be zero
+          bool is_empty = (hend <= hstart) || (wend <= wstart);
+          output_data[pool_index] =
+              is_empty ? 0 : -std::numeric_limits<T>::max();
+          arg_max_data[pool_index] = -1;
+
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int index = h * width + w;
+              if (batch_data[index] > output_data[pool_index]) {
+                output_data[pool_index] = batch_data[index];
+                arg_max_data[pool_index] = index;
+              }
+            }
+          }
+        }
+      }
+
+      batch_data += in_stride[1];
+      output_data += out_stride[1];
+      arg_max_data += arg_max_stride[1];
+    }
+    // Increment ROI data pointer
+    boxes_data += box_stride[0];
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc
new file mode 100644
index 0000000000000..b0d0c0663e4a2
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roll_grad_kernel.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RollGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const ScalarArray& shifts,
+                    const std::vector<int64_t>& axis,
+                    DenseTensor* x_grad) {
+  std::vector<T> out_vec;
+  paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec);
+
+  auto shifts_data = shifts.GetData();
+  size_t nums = shifts_data.size();
+  DDim input_dim = out_grad.dims();
+  auto dims = axis;
+
+  // axis = none, reshape to 1-D tensor
+  if (dims.size() == 0) {
+    dims.push_back(0l);
+    input_dim = phi::Dim<1>(out_vec.size());
+  }
+
+  for (size_t i = 0; i < nums; i++) {
+    ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]);
+  }
+
+  dev_ctx.template Alloc<T>(x_grad);
+  paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad);
+  x_grad->Resize(out_grad.dims());
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roll_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::RollGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc
new file mode 100644
index 0000000000000..25b64ef257dfb
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roll_kernel.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roll_kernel.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/roll_kernel_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RollKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const ScalarArray& shifts,
+                const std::vector<int64_t>& axis,
+                DenseTensor* out) {
+  std::vector<T> out_vec;
+  paddle::framework::TensorToVector(x, dev_ctx, &out_vec);
+
+  auto shifts_data = shifts.GetData();
+  size_t nums = shifts_data.size();
+  DDim input_dim = x.dims();
+  auto dims = axis;
+
+  // axis = none, reshape to 1-D tensor
+  if (dims.size() == 0) {
+    dims.push_back(0l);
+    input_dim = phi::Dim<1>(out_vec.size());
+  }
+
+  for (size_t i = 0; i < nums; i++) {
+    PADDLE_ENFORCE_EQ(
+        dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()),
+        true,
+        phi::errors::OutOfRange(
+            "Attr(axis[%d]) is out of range, It's expected "
+            "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
+            i,
+            input_dim.size(),
+            input_dim.size() - 1,
+            i,
+            dims[i]));
+    ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]);
+  }
+  dev_ctx.template Alloc<T>(out);
+  paddle::framework::TensorFromVector(out_vec, dev_ctx, out);
+  out->Resize(x.dims());
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roll,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::RollKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h
new file mode 100644
index 0000000000000..924e71aff31f3
--- /dev/null
+++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h
@@ -0,0 +1,76 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T>
+inline void ShiftAlongDim(T* data,
+                          const DDim& input_dim,
+                          int64_t dim,
+                          int64_t shift) {
+  if (dim < 0) {
+    dim += input_dim.size();
+  }
+  if (input_dim[dim] == 0) {
+    return;
+  }
+  shift = shift % input_dim[dim];
+  if (shift < 0) {
+    shift += input_dim[dim];
+  }
+
+  auto outer_loops = 1;
+  for (auto i = 0; i < dim; i++) {
+    outer_loops *= input_dim[i];
+  }
+  auto slice_width = 1;
+  for (auto i = dim + 1; i < input_dim.size(); i++) {
+    slice_width *= input_dim[i];
+  }
+
+  VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim
+          << "; dim: " << dim << "; shift: " << shift
+          << "; outer_loops: " << outer_loops
+          << "; slice_width: " << slice_width;
+  if (shift == 0) {
+    return;
+  }
+
+  std::vector<T> head;
+  auto head_size = slice_width * (input_dim[dim] - shift);
+  head.resize(head_size);
+
+  for (auto i = 0; i < outer_loops; i++) {
+    for (auto j = 0; j < head_size; j++) {
+      head[j] = data[i * input_dim[dim] * slice_width + j];
+    }
+    for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) {
+      auto dst_pos = j - input_dim[dim] + shift;
+      for (auto k = 0; k < slice_width; k++) {
+        data[(i * input_dim[dim] + dst_pos) * slice_width + k] =
+            data[(i * input_dim[dim] + j) * slice_width + k];
+      }
+    }
+    for (auto j = 0; j < head_size; j++) {
+      data[(i * input_dim[dim] + shift) * slice_width + j] = head[j];
+    }
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
index 585c27bdcec97..a5c9dc4c55e49 100644
--- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc
@@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    ALL_LAYOUT,
                    phi::SegmentPoolGradKernel,
                    float,
-                   double) {}
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
index d0413457f8177..ad76a7a86bcb2 100644
--- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc
+++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc
@@ -18,5 +18,11 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
+PD_REGISTER_KERNEL(segment_pool,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc
new file mode 100644
index 0000000000000..d78477073ad03
--- /dev/null
+++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+struct SparseWeightEmbeddingGradCPUFunctor {
+  SparseWeightEmbeddingGradCPUFunctor(const Context& dev_ctx,
+                                      const DenseTensor& input,
+                                      const SelectedRows& weight,
+                                      const DenseTensor& out_grad,
+                                      int64_t padding_idx,
+                                      DenseTensor* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        weight_grad_(weight_grad),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    DDim table_dim = weight_.dims();
+
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_num = static_cast<int64_t>(ids.size());
+
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    {
+      auto* d_output = &out_grad_;
+      // auto d_table = weight_grad_;
+      auto* ids_data = ids.data();
+
+      int64_t N = table_dim[0];
+      int64_t D = table_dim[1];
+
+      auto* d_output_data = d_output->template data<T>();
+
+      dev_ctx_.template Alloc<T>(weight_grad_);
+      auto* d_table_data = weight_grad_->data<T>();
+
+      memset(d_table_data, 0, weight_grad_->numel() * sizeof(T));
+
+      for (int64_t i = 0; i < ids_num; ++i) {
+        if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) {
+          // the gradient of padding_idx should be 0, already done by memset, so
+          // do nothing.
+        } else {
+          PADDLE_ENFORCE_LT(
+              ids_data[i],
+              N,
+              phi::errors::InvalidArgument(
+                  "Variable value (input) of "
+                  "OP(paddle.nn.functional.embedding) "
+                  "expected >= 0 and < %ld, but got %ld. Please check input "
+                  "value.",
+                  N,
+                  ids_data[i]));
+          PADDLE_ENFORCE_GE(
+              ids_data[i],
+              0,
+              phi::errors::InvalidArgument(
+                  "Variable value (input) of "
+                  "OP(paddle.nn.functional.embedding) "
+                  "expected >= 0 and < %ld, but got %ld. Please check input "
+                  "value.",
+                  N,
+                  ids_data[i]));
+          for (int j = 0; j < D; ++j) {
+            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const SelectedRows& weight_;
+  const DenseTensor& out_grad_;
+  DenseTensor* weight_grad_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+struct SparseWeightEmbeddingSparseGradCPUFunctor {
+  SparseWeightEmbeddingSparseGradCPUFunctor(const Context& dev_ctx,
+                                            const DenseTensor& input,
+                                            const SelectedRows& weight,
+                                            const DenseTensor& out_grad,
+                                            int64_t padding_idx,
+                                            SelectedRows* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        weight_grad_(weight_grad),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    DDim table_dim = weight_.dims();
+
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_num = static_cast<int64_t>(ids.size());
+
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    auto* d_table = weight_grad_;
+    auto* d_output = &out_grad_;
+    d_table->set_rows(ids);
+
+    auto* d_table_value = d_table->mutable_value();
+    d_table_value->Resize({ids_num, table_dim[1]});
+
+    dev_ctx_.template Alloc<T>(d_table_value);
+
+    d_table->set_height(table_dim[0]);
+
+    auto* d_output_data = d_output->template data<T>();
+    auto* d_table_data = d_table_value->template data<T>();
+
+    auto d_output_dims = d_output->dims();
+    auto d_output_dims_2d =
+        phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
+    PADDLE_ENFORCE_EQ(d_table_value->dims(),
+                      d_output_dims_2d,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: The shape of lookup_table@Grad and "
+                          "output@Grad should be same. "
+                          "But received lookup_table@Grad's shape = [%s], "
+                          "output@Grad's shape = [%s].",
+                          d_table_value->dims(),
+                          d_output_dims_2d));
+    memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const SelectedRows& weight_;
+  const DenseTensor& out_grad_;
+  SelectedRows* weight_grad_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingGradKernel(const Context& ctx,
+                                     const DenseTensor& input,
+                                     const SelectedRows& weight,
+                                     const DenseTensor& out_grad,
+                                     int64_t padding_idx,
+                                     DenseTensor* weight_grad) {
+  SparseWeightEmbeddingGradCPUFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingSparseGradKernel(const Context& ctx,
+                                           const DenseTensor& input,
+                                           const SelectedRows& weight,
+                                           const DenseTensor& out_grad,
+                                           int64_t padding_idx,
+                                           SelectedRows* weight_grad) {
+  SparseWeightEmbeddingSparseGradCPUFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_weight_embedding_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SparseWeightEmbeddingGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(sparse_weight_embedding_sparse_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SparseWeightEmbeddingSparseGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc
new file mode 100644
index 0000000000000..c0f95d03888b8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/embedding_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+struct EmbeddingCPUSparseFunctor {
+  EmbeddingCPUSparseFunctor(const Context& dev_ctx,
+                            const DenseTensor& input,
+                            const SelectedRows& weight,
+                            int64_t padding_idx,
+                            DenseTensor* out)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_(out),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    auto ids = CopyIdsToVector<IdT, int64_t>(input_);
+    auto ids_numel = static_cast<int64_t>(ids.size());
+
+    const auto& table_t = weight_;
+    auto output_t = out_;
+    int64_t row_width = table_t.value().dims()[1];
+    const auto* table = table_t.value().template data<T>();
+    auto* output = dev_ctx_.template Alloc<T>(output_t);
+    auto input_data_type =
+        paddle::framework::TransToProtoVarType(table_t.value().dtype());
+
+    for (int64_t i = 0; i < ids_numel; ++i) {
+      if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) {
+        memset(output + i * row_width, 0, row_width * sizeof(T));
+      } else {
+        PADDLE_ENFORCE_GE(
+            ids[i],
+            0,
+            phi::errors::InvalidArgument(
+                "Variable value (input) of OP(fluid.layers.embedding) "
+                "expected >= 0. But received %ld",
+                ids[i]));
+        auto id_index = table_t.Index(ids[i]);
+        PADDLE_ENFORCE_GE(
+            id_index,
+            0,
+            phi::errors::InvalidArgument(
+                "the input key should be exists. But received %d.", id_index));
+
+        if (input_data_type == paddle::framework::proto::VarType::BF16) {
+          memcpy(output + i * row_width,
+                 table + id_index * row_width,
+                 row_width * sizeof(T));
+        } else {
+          auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx_);
+          blas.VCOPY(
+              row_width, table + id_index * row_width, output + i * row_width);
+        }
+      }
+    }
+  }
+
+ private:
+  const Context& dev_ctx_;
+  const DenseTensor& input_;
+  const SelectedRows& weight_;
+  DenseTensor* out_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingKernel(const Context& ctx,
+                                 const DenseTensor& input,
+                                 const SelectedRows& weight,
+                                 int64_t padding_idx,
+                                 DenseTensor* out) {
+  EmbeddingCPUSparseFunctor<T, Context> functor(
+      ctx, input, weight, padding_idx, out);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_weight_embedding,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SparseWeightEmbeddingKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/squeeze_grad_kernel.cc
similarity index 51%
rename from paddle/phi/kernels/cpu/reduce_prod_kernel.cc
rename to paddle/phi/kernels/cpu/squeeze_grad_kernel.cc
index 9a9bf46e948bc..5f605e6c2504b 100644
--- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc
+++ b/paddle/phi/kernels/cpu/squeeze_grad_kernel.cc
@@ -12,33 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_prod_kernel.h"
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
 
+#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/cpu/reduce.h"
-#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
 
-namespace phi {
-
-template <typename T, typename Context>
-void ReduceProdKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const std::vector<int64_t>& dims,
-                      bool keep_dim,
-                      bool reduce_all,
-                      DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  phi::Reduce<CPUContext, T, phi::funcs::ProdFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(reduce_prod,
+PD_REGISTER_KERNEL(squeeze_grad,
                    CPU,
                    ALL_LAYOUT,
-                   phi::ReduceProdKernel,
+                   phi::SqueezeGradKernel,
                    float,
                    double,
+                   phi::dtype::bfloat16,
+                   bool,
                    int,
-                   int64_t) {}
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/squeeze_kernel.cc
similarity index 83%
rename from paddle/phi/kernels/cpu/shape_kernel.cc
rename to paddle/phi/kernels/cpu/squeeze_kernel.cc
index 073dc63b2a434..7d5a6ca4e884e 100644
--- a/paddle/phi/kernels/cpu/shape_kernel.cc
+++ b/paddle/phi/kernels/cpu/squeeze_kernel.cc
@@ -12,22 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/shape_kernel.h"
-#include "paddle/phi/kernels/impl/shape_kernel_impl.h"
+#include "paddle/phi/kernels/squeeze_kernel.h"
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
 
-PD_REGISTER_KERNEL(shape,
+PD_REGISTER_KERNEL(squeeze,
                    CPU,
                    ALL_LAYOUT,
-                   phi::ShapeKernel,
+                   phi::SqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
                    bool,
                    int,
-                   int8_t,
                    uint8_t,
+                   int8_t,
                    int64_t,
-                   float,
-                   double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc
new file mode 100644
index 0000000000000..14aca258a2c71
--- /dev/null
+++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(tril_triu_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TrilTriuGradKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc
new file mode 100644
index 0000000000000..a3d20e55e21fb
--- /dev/null
+++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(tril_triu,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TrilTriuKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc
new file mode 100644
index 0000000000000..0cbccac4734a7
--- /dev/null
+++ b/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeGradKernel,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc
new file mode 100644
index 0000000000000..0152a31f80ba8
--- /dev/null
+++ b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h
new file mode 100644
index 0000000000000..b3cb17b28e07f
--- /dev/null
+++ b/paddle/phi/kernels/cumprod_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& out,
+                       const DenseTensor& dout,
+                       int dim,
+                       DenseTensor* dx);
+}  // phi
diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h
new file mode 100644
index 0000000000000..96d76cb0f4370
--- /dev/null
+++ b/paddle/phi/kernels/cumprod_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   int dim,
+                   DenseTensor* out);
+}  // phi
diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h
index fd90c7b8f5eee..f105c94d559d8 100644
--- a/paddle/phi/kernels/cumsum_kernel.h
+++ b/paddle/phi/kernels/cumsum_kernel.h
@@ -18,7 +18,7 @@
 
 namespace phi {
 
-template <typename Functor, typename Context>
+template <typename T, typename Context>
 void CumsumKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   int axis,
diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h
new file mode 100644
index 0000000000000..3886e6801a31b
--- /dev/null
+++ b/paddle/phi/kernels/deformable_conv_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeformableConvKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& offset,
+                          const DenseTensor& filter,
+                          const DenseTensor& mask,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::vector<int>& dilations,
+                          int deformable_groups,
+                          int groups,
+                          int im2col_step,
+                          DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/determinant_grad_kernel.h b/paddle/phi/kernels/determinant_grad_kernel.h
new file mode 100644
index 0000000000000..87228afc51b52
--- /dev/null
+++ b/paddle/phi/kernels/determinant_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeterminantGradKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& out,
+                           const DenseTensor& out_grad,
+                           DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/determinant_kernel.h b/paddle/phi/kernels/determinant_kernel.h
new file mode 100644
index 0000000000000..abd5f5691b3e5
--- /dev/null
+++ b/paddle/phi/kernels/determinant_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DeterminantKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc
similarity index 70%
rename from paddle/phi/kernels/math_kernel.cc
rename to paddle/phi/kernels/elementwise_kernel.cc
index a5d3f51e5447f..9d10a48c9e079 100644
--- a/paddle/phi/kernels/math_kernel.cc
+++ b/paddle/phi/kernels/elementwise_kernel.cc
@@ -12,34 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 
-template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& dims,
-                bool keep_dim,
-                DenseTensor* out) {
-  bool reduce_all = false;
-  MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
-}
-
-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               DataType out_dtype,
-               bool keep_dim,
-               DenseTensor* out) {
-  bool reduce_all = false;
-  SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
-}
-
 template <typename T, typename Context>
 void AddKernel(const Context& dev_ctx,
                const DenseTensor& x,
@@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx,
 using complex64 = ::phi::dtype::complex<float>;
 using complex128 = ::phi::dtype::complex<double>;
 
-PD_REGISTER_KERNEL(
-    mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {}
-
-PD_REGISTER_KERNEL(sum,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::SumKernel,
-                   bool,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-
 PD_REGISTER_KERNEL(add,
                    CPU,
                    ALL_LAYOUT,
@@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply,
                    phi::dtype::bfloat16) {}
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(mean,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::MeanKernel,
-                   float,
-                   double,
-                   bool,
-                   int,
-                   int64_t,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(sum,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SumKernel,
-                   bool,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
+
 PD_REGISTER_KERNEL(add,
                    GPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h
index c1e73ad91c67d..b064ecc454c59 100644
--- a/paddle/phi/kernels/elementwise_kernel.h
+++ b/paddle/phi/kernels/elementwise_kernel.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/device_context.h"
+#include "paddle/phi/infermeta/binary.h"
 
 namespace phi {
 
@@ -33,4 +33,100 @@ void ElementwiseFMinKernel(const Context& dev_ctx,
                            int axis,
                            DenseTensor* out);
 
+template <typename T, typename Context>
+void AddRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void SubtractRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
+template <typename T, typename Context>
+void SubtractKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out);
+
+template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis,
+                     DenseTensor* out);
+
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MultiplyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
+template <typename T, typename Context>
+void MultiplyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out);
+
+template <typename T, typename Context>
+DenseTensor Add(const Context& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  ElementwiseInferMeta(x, y, &meta_out);
+  AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor Subtract(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  ElementwiseInferMeta(x, y, &meta_out);
+  SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor Divide(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  ElementwiseInferMeta(x, y, &meta_out);
+  DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor Multiply(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  ElementwiseInferMeta(x, y, &meta_out);
+  MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/embedding_grad_kernel.h b/paddle/phi/kernels/embedding_grad_kernel.h
new file mode 100644
index 0000000000000..40ffe6ec886c4
--- /dev/null
+++ b/paddle/phi/kernels/embedding_grad_kernel.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EmbeddingGradKernel(const Context& ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& weight,
+                         const DenseTensor& out_grad,
+                         int64_t padding_idx,
+                         DenseTensor* weight_grad);
+
+template <typename T, typename Context>
+void EmbeddingSparseGradKernel(const Context& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& weight,
+                               const DenseTensor& out_grad,
+                               int64_t padding_idx,
+                               SelectedRows* weight_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/embedding_kernel.h b/paddle/phi/kernels/embedding_kernel.h
new file mode 100644
index 0000000000000..cd7d675d6dc6c
--- /dev/null
+++ b/paddle/phi/kernels/embedding_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EmbeddingKernel(const Context& ctx,
+                     const DenseTensor& inputx,
+                     const DenseTensor& weight,
+                     int64_t padding_idx,
+                     DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/frobenius_norm_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
new file mode 100644
index 0000000000000..edf3aed8b8493
--- /dev/null
+++ b/paddle/phi/kernels/frobenius_norm_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FrobeniusNormGradKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& out,
+                             const DenseTensor& dout,
+                             const std::vector<int64_t>& axis,
+                             bool keep_dim,
+                             bool reduce_all,
+                             DataType in_dtype,
+                             DataType out_dtype,
+                             DenseTensor* dx);
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/frobenius_norm_kernel.h
similarity index 72%
rename from paddle/phi/kernels/reduce_sum_grad_kernel.h
rename to paddle/phi/kernels/frobenius_norm_kernel.h
index ab4d63297efff..f5f37ee0c0fa5 100644
--- a/paddle/phi/kernels/reduce_sum_grad_kernel.h
+++ b/paddle/phi/kernels/frobenius_norm_kernel.h
@@ -14,19 +14,17 @@
 
 #pragma once
 
-#include "paddle/phi/common/data_type.h"
+#include <vector>
 #include "paddle/phi/core/dense_tensor.h"
+
 namespace phi {
 
 template <typename T, typename Context>
-void ReduceSumGradKernel(const Context& dev_ctx,
+void FrobeniusNormKernel(const Context& ctx,
                          const DenseTensor& x,
-                         const DenseTensor& out_grad,
-                         const std::vector<int64_t>& dims,
+                         const std::vector<int64_t>& axis,
                          bool keep_dim,
                          bool reduce_all,
-                         DataType in_dtype,
-                         DataType out_dtype,
-                         DenseTensor* x_grad);
+                         DenseTensor* out);
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc
new file mode 100644
index 0000000000000..9622bff5c255a
--- /dev/null
+++ b/paddle/phi/kernels/full_kernel.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/full_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FullBatchSizeLikeKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const std::vector<int>& shape,
+                             const Scalar& val,
+                             DataType dtype,
+                             int x_batch_size_dim,
+                             int out_batch_size_dim,
+                             DenseTensor* out) {
+  if (x.lod().size() && x_batch_size_dim == 0) {
+    // set the correct batch size for the LoDTensor.
+    auto odims = out->dims();
+    odims[out_batch_size_dim] = static_cast<int>(x.lod().back().size()) - 1;
+    FullKernel<T, Context>(dev_ctx, phi::vectorize(odims), val, dtype, out);
+  }
+  FullLikeKernel<T, Context>(dev_ctx, x, val, dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(full_batch_size_like,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FullBatchSizeLikeKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(full_batch_size_like,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FullBatchSizeLikeKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
+#endif
diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h
index 41fc96b6db1fa..df82e651a0b26 100644
--- a/paddle/phi/kernels/full_kernel.h
+++ b/paddle/phi/kernels/full_kernel.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <vector>
+
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
@@ -37,6 +39,18 @@ void FullLikeKernel(const Context& dev_ctx,
                     DataType dtype,
                     DenseTensor* out);
 
+// In order to be compatible with fill_constant_batch_size_like op
+// that are still used in the 2.x APIs
+template <typename T, typename Context>
+void FullBatchSizeLikeKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const std::vector<int>& shape,
+                             const Scalar& val,
+                             DataType dtype,
+                             int x_batch_size_dim,
+                             int out_batch_size_dim,
+                             DenseTensor* out);
+
 template <typename T, typename Context>
 void Full(const Context& dev_ctx,
           const ScalarArray& shape,
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 1a36e4e132f41..6e536bd00a4a1 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -29,11 +29,17 @@
 #include <type_traits>
 
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/extensions.h"
+
+#ifdef PADDLE_WITH_XPU_KP
+#define __forceinline__ __inline__
+#endif
 
 namespace phi {
 namespace funcs {
@@ -513,24 +519,24 @@ struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-#if defined(__NVCC__) || defined(__HIPCC__)
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
-struct CudaReluFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-
-  // relu(x) = max(x, 0)
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x > zero ? x : zero;
+struct TanhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.tanh();
   }
 };
 
 template <typename T>
-struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-
-  // dx = dout * (out > 0)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return out > zero ? dout : zero;
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) - out * out);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() {
@@ -539,286 +545,1646 @@ struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
 };
 
 template <typename T>
-struct CudaCosFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // cos(x) = cos(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(cos(x));
+struct TanhGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  DenseTensor* dOutNew,
+                  DenseTensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad"));
+    // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out
+    // * ddx)
+    if (dOutNew) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad"));
+      auto dout_new = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad"));
+      dout_new.device(*d) =
+          static_cast<T>(-1) * dout * static_cast<T>(2) * out * ddx;
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out * out) * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
   }
 };
+/*
+    Out
+    DOut                            D_Dout
+    DDx     -> TanhTripleGrad ->    D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
 
-template <typename T>
-struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    D_Dout = (-2) * Out * DDx * D_Dout_new
+    D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new
+    D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new
 
-  // dx = dout * (-sin(x))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(-dout * sin(x));
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct TanhTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  const DenseTensor* d_DDOut,
+                  const DenseTensor* d_dOut_New,
+                  DenseTensor* d_d_Out,
+                  DenseTensor* d_Out_New,
+                  DenseTensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad"));
+    auto dout = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad"));
+    auto d_ddOut = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad"));
+    auto d_dOutNew = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad"));
+      d_OutNew.device(*d) = (static_cast<T>(-2) * out * ddx * d_ddOut) -
+                            (static_cast<T>(2) * dout * ddx * d_dOutNew);
+    }
+    if (d_d_Out) {
+      auto d_dOut = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad"));
+      d_dOut.device(*d) = static_cast<T>(-2) * out * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad"));
+      d_ddx.device(*d) = (static_cast<T>(1) - (out * out)) * d_ddOut -
+                         static_cast<T>(2) * out * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
   }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaSinFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct BReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
 
-  // sin(x) = sin(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sin(x));
+  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
+  // not polymorphism for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
   }
 };
 
 template <typename T>
-struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // dx = dout * cos(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * cos(x));
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                       .template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaTanFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct LeakyReluFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-  // tan(x) = tan(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(tan(x));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    if (alpha < 1.f) {
+      out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+    } else {
+      out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
+    }
   }
 };
 
 template <typename T>
-struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // dx = dout / cos(x)^2
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout / (cos(x) * cos(x)));
+struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 =
+        static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaAsinFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // asin(x) = asin(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(asin(x));
+struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
   }
-};
-
-template <typename T>
-struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // dx = dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout / sqrt(one - x * x));
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    if (ddOut) {
+      auto* d = dev.eigen_device();
+      auto ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
+      auto x = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
+      ddout.device(*d) =
+          ddx *
+          ((x > static_cast<T>(0)).template cast<T>() +
+           static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
+              .template cast<T>();
+    }
   }
-
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaAcosFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
 
-  // acos(x) = acos(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(acos(x));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto th = static_cast<T>(threshold);
+    out.device(d) = (x > th).template cast<T>() * x;
   }
 };
 
 template <typename T>
-struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
+struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
 
-  // dx = -dout / sqrt(1 - x^2)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(-dout / sqrt(one - x * x));
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto th = static_cast<T>(threshold);
+    dx.device(d) = dout * (x > th).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
-struct CudaCoshFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // cosh(x) = cosh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(cosh(x));
+struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x - x.tanh();
   }
 };
 
 template <typename T>
-struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // dx = dout * sinh(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * sinh(x));
+struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x.tanh() * x.tanh());
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 template <typename T>
-struct CudaSinhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct HardShrinkFunctor : public BaseActivationFunctor<T> {
+  float threshold;
 
-  // sinh(x) = sinh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(sinh(x));
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    out.device(d) = x * (temp1 || temp2).template cast<T>();
   }
 };
 
 template <typename T>
-struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
 
-  // dx = dout * cosh(x)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * cosh(x));
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 = x < static_cast<T>(threshold * -1.f);
+    auto temp2 = x > static_cast<T>(threshold);
+    dx.device(d) = dout * (temp1 || temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
+// otherwise
 template <typename T>
-struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
 
-  // Acosh(x) = acosh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(acosh(x));
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>();
+    auto temp2 = (x < -lambdaT).template cast<T>();
+    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
   }
 };
 
 template <typename T>
-struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-  // dx = dout * 1 / sqrt(x^2 - 1)
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / sqrt(x * x - one));
+struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>();
+    auto temp2 = (x < -lambdaT).template cast<T>();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+struct ELUFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-  // Asinh(x) = asinh(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        (x < static_cast<T>(0))
+            .select(static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)), x);
+  }
+};
+
+template <typename T>
+struct ELUGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    // case 1: alpha >= 0
+    // dx = dout, if out > 0
+    // dx = dout * (out + alpha), if out <= 0
+    dx.device(d) = (out > static_cast<T>(0))
+                       .select(dout, dout * (out + static_cast<T>(alpha)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    // case 2: alpha < 0
+    // dx = dout, if x > 0
+    // dx = dout * (out + alpha), if x <=0
+    dx.device(d) = (x > static_cast<T>(0))
+                       .select(dout, dout * static_cast<T>(alpha) * x.exp());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  const DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
+    auto x = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));
+
+    if (dX) {
+      auto dx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad"));
+      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
+                      (x <= static_cast<T>(0)).template cast<T>();
+    }
+
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
+      ddout.device(*d) = ddx *
+                         ((x > static_cast<T>(0)).template cast<T>() +
+                          static_cast<T>(alpha) * x.exp() *
+                              (x <= static_cast<T>(0)).template cast<T>())
+                             .template cast<T>();
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// silu(x) = x / (1 + exp(-x))
+template <typename T>
+struct SiluFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+    out.device(d) = x * temp;
+  }
+};
+
+// silu'(x) = (1 / (1 + e^{-x}))  * (1 + out * e^{-x}))
+template <typename T>
+struct SiluGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 = static_cast<T>(1) + (-x).exp();  // 1+e^(-x)
+    auto temp2 = x * (-x).exp();                  // x*e^(-x)
+    dx.device(d) = dout * ((static_cast<T>(1) / temp1) *
+                           (static_cast<T>(1) + (temp2 / temp1)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// sigmoid(x) = 1 / (1 + exp(-x))
+template <typename T>
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+  }
+};
+
+template <typename T>
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out * (static_cast<T>(1) - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut -> SigmoidGradGrad -> DOutNew
+    DDX                        DDOut
+
+    DDOut = (1-Out)*Out*DDX
+    DOutNew = (1-2*Out)*DOut*DDX
+*/
+template <typename T>
+struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  DenseTensor* dOutNew,
+                  DenseTensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
+
+    if (dOutNew) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
+      auto dout_new = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
+      dout_new.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut                            D_Dout
+    DDx     -> SigmoidTripleGrad -> D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
+
+    D_Dout = (1-2*Out)*DDx*D_Dout_new
+    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
+    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
+
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  const DenseTensor* d_DDOut,
+                  const DenseTensor* d_dOut_New,
+                  DenseTensor* d_d_Out,
+                  DenseTensor* d_Out_New,
+                  DenseTensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
+    auto dout = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
+    auto d_ddOut = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
+    auto d_dOutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
+        d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
+          d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
+      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
+                            static_cast<T>(2) * dout * ddx * d_dOutNew;
+    }
+    if (d_d_Out) {
+      auto d_dOut = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
+      d_dOut.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
+      d_ddx.device(*d) =
+          (static_cast<T>(1) - out) * out * d_ddOut +
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
+//           max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//   = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+// + exp(-x - max(-x, 0))))
+template <typename T>
+struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template <typename T>
+struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    out.device(d) =
+        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
+                       .template cast<T>() *
+                   static_cast<T>(slope);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// log(x) = natural logarithm of x
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.log();
+  }
+};
+
+template <typename T>
+struct LogGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) / x);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// log2(x) = logarithm to the base 2 of the elements of x
+template <typename T>
+struct Log2Functor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.log() / static_cast<T>(log(2));
+  }
+};
+
+// the gradient of log2(x) is 1/(x*ln(2))
+template <typename T>
+struct Log2GradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// log10(x) = logarithm to the base 10 of the elements of x
+template <typename T>
+struct Log10Functor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.log() / static_cast<T>(log(10));
+  }
+};
+
+// the gradient of log10(x) is 1/(x*ln(10))
+template <typename T>
+struct Log10GradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+// log1p(x) = natural logarithm of x+1
+template <typename T>
+struct Log1pFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = (static_cast<T>(1) + x).log();
+  }
+};
+
+template <typename T>
+struct Log1pGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct LogGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  const DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
+    auto x = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
+    // ddout = ddx / x; dx = -(dout / x) * (ddx / x)
+    // calculate dx first, so ddout can inplace ddx
+    if (dX) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
+      auto dx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
+      dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
+      ddout.device(*d) = ddx * static_cast<T>(1) / x;
+    }
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
+template <typename T>
+struct CudaReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  // relu(x) = max(x, 0)
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > zero ? x : zero;
+  }
+};
+
+template <typename T>
+struct CudaReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+
+  // dx = dout * (out > 0)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return out > zero ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaCosFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // cos(x) = cos(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(cos(x));
+  }
+};
+
+template <typename T>
+struct CudaCosGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * (-sin(x))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(-dout * sin(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSinFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // sin(x) = sin(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(sin(x));
+  }
+};
+
+template <typename T>
+struct CudaSinGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * cos(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * cos(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaTanFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // tan(x) = tan(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(tan(x));
+  }
+};
+
+template <typename T>
+struct CudaTanGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout / cos(x)^2
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout / (cos(x) * cos(x)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAsinFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // asin(x) = asin(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(asin(x));
+  }
+};
+
+template <typename T>
+struct CudaAsinGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = dout / sqrt(1 - x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAcosFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // acos(x) = acos(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(acos(x));
+  }
+};
+
+template <typename T>
+struct CudaAcosGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = -dout / sqrt(1 - x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(-dout / sqrt(one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaCoshFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // cosh(x) = cosh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(cosh(x));
+  }
+};
+
+template <typename T>
+struct CudaCoshGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * sinh(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * sinh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSinhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // sinh(x) = sinh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(sinh(x));
+  }
+};
+
+template <typename T>
+struct CudaSinhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // dx = dout * cosh(x)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * cosh(x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAcoshFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Acosh(x) = acosh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(acosh(x));
+  }
+};
+
+template <typename T>
+struct CudaAcoshGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  // dx = dout * 1 / sqrt(x^2 - 1)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / sqrt(x * x - one));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAsinhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Asinh(x) = asinh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(asinh(x));
+  }
+};
+
+template <typename T>
+struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // dx = dout * 1/sqrt(x^2 + 1)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / sqrt(x * x + one));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // Atanh(x) = atanh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(atanh(x));
+  }
+};
+
+template <typename T>
+struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+  // dx = dout * 1/(1- x^2)
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(dout * one / (one - x * x));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaAtanFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // atan(x) = atan(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(atan(x));
+  }
+};
+
+template <typename T>
+struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout / (1 + x^2)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout / (one + x * x);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaTanhFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // tanh(x) = tanh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(tanh(x));
+  }
+};
+
+template <typename T>
+struct CudaTanhGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout * (1 - out^2)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * (one - out * out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaBReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  // brelu(x) = min(max(x, t_min), t_max)
+  __device__ __forceinline__ T operator()(const T x) const {
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    T temp_max = x > t_min_cast ? x : t_min_cast;
+    T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast;
+    return temp_min;
+  }
+};
+
+template <typename T>
+struct CudaBReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float t_min;
+  float t_max;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  // dx = (x > t_min && x < t_max) ? dout : 0
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    T t_min_cast = static_cast<T>(t_min);
+    T t_max_cast = static_cast<T>(t_max);
+    return (x > t_min_cast && x < t_max_cast) ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // thresholded_relu(x) = x > threshold ? x : 0
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > static_cast<T>(threshold) ? x : zero;
+  }
+};
+
+template <typename T>
+struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // dx = x > threshold ? dout : 0
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return x > static_cast<T>(threshold) ? dout : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaLeakyReluFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // leakyrelu(x) = x > 0 ? x : alpha * x
+  __device__ __forceinline__ T operator()(const T x) const {
+    return x > zero ? x : static_cast<T>(alpha) * x;
+  }
+};
+
+template <typename T>
+struct CudaLeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // dx = dout * (x > 0 ? 1 : alpha)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return x > zero ? dout : static_cast<T>(alpha) * dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSoftShrinkFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  // softshrink(x) = x - lambda, if x > lambda;
+  //                 x + lambda, if x < -lambda;
+  //                 0, otherwise.
+  __device__ __forceinline__ T operator()(const T x) const {
+    T l = static_cast<T>(lambda);
+    T temp1 = static_cast<T>(x > l);
+    T temp2 = static_cast<T>(x < -l);
+    return temp1 * (x - l) + temp2 * (x + l);
+  }
+};
+
+template <typename T>
+struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float lambda;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  // dx = dout, if x > lambda or x < -lambda else 0
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    T l = static_cast<T>(lambda);
+    return (x >= -l && x <= l) ? zero : dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaTanhShrinkFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // tanhshrink(x) = x - tanh(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(asinh(x));
+    return static_cast<T>(x - tanh(x));
   }
 };
 
 template <typename T>
-struct CudaAsinhGradFunctor : public BaseActivationFunctor<T> {
+struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
 
-  // dx = dout * 1/sqrt(x^2 + 1)
+  // dx = dout * tanh(x)^2
   __device__ __forceinline__ T operator()(const T arg_dout,
                                           const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / sqrt(x * x + one));
+    return static_cast<T>(dout * tanh(x) * tanh(x));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaAtanhFunctor : public BaseActivationFunctor<T> {
+struct CudaHardShrinkFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x
+  __device__ __forceinline__ T operator()(const T x) const {
+    T t = static_cast<T>(threshold);
+    return (x > -t && x < t) ? zero : x;
+  }
+};
+
+template <typename T>
+struct CudaHardShrinkGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  // dx = (x > -threshold && x < threshold) ? 0 : dout
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    T t = static_cast<T>(threshold);
+    return (x > -t && x < t) ? zero : dout;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaELUFunctor : public BaseActivationFunctor<T> {
+  using CT = typename phi::dtype::MPTypeTrait<T>::Type;
+  CT zero = static_cast<CT>(0.0f);
+  CT one = static_cast<CT>(1.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // elu(x) = x, if x > 0
+  // elu(x) = alpha * (e^x - 1), if x <= 0
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    CT x = static_cast<CT>(arg_x);
+    CT temp = static_cast<CT>(alpha) * (exp(x) - one);
+    CT res = x > zero ? x : temp;
+    return static_cast<T>(res);
+  }
+};
+
+template <typename T>
+struct CudaELUGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+  float alpha;
 
-  // Atanh(x) = atanh(x)
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // case 1: alpha >= 0
+  // dx = dout, if out > 0
+  // dx = dout * (out + alpha), if out <= 0
+  __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType out = static_cast<MPType>(arg_out);
+    MPType a = static_cast<MPType>(alpha);
+    MPType out_pos = static_cast<MPType>(out > zero);
+    MPType out_neg = static_cast<MPType>(out <= zero);
+    return static_cast<T>(dout * (out_pos + out_neg * (out + a)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+  float alpha;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  // case 2: alpha < 0
+  // dx = dout, if x > 0
+  // dx = dout * (out + alpha), if x <=0
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_out,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType out = static_cast<MPType>(arg_out);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType a = static_cast<MPType>(alpha);
+    MPType x_pos = static_cast<MPType>(x > zero);
+    MPType x_neg = static_cast<MPType>(x <= zero);
+    return static_cast<T>(dout * (x_pos + x_neg * (out + a)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaSiluFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // silu(x) = x / (1 + exp(-x))
   __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(atanh(x));
+    return static_cast<T>(x / (one + exp(-x)));
   }
 };
 
 template <typename T>
-struct CudaAtanhGradFunctor : public BaseActivationFunctor<T> {
+struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
   MPType one = static_cast<MPType>(1.0f);
-  // dx = dout * 1/(1- x^2)
+
+  // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2)
   __device__ __forceinline__ T operator()(const T arg_dout,
                                           const T arg_x) const {
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(dout * one / (one - x * x));
+    MPType temp = one / (one + exp(-x));
+    return static_cast<T>(dout * (temp * (one + x * (one - temp))));
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
 template <typename T>
-struct CudaAtanFunctor : public BaseActivationFunctor<T> {
+struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
 
-  // atan(x) = atan(x)
+  // sigmoid(x) = 1 / (1 + exp(-x))
   __device__ __forceinline__ T operator()(const T arg_x) const {
     MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(atan(x));
+    return static_cast<T>(one / (one + exp(-x)));
   }
 };
 
 template <typename T>
-struct CudaAtanGradFunctor : public BaseActivationFunctor<T> {
+struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
   T one = static_cast<T>(1.0f);
 
-  // dx = dout / (1 + x^2)
+  // dx = dout * out * (1 - out)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out * (one - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // logsigmoid(x) = log(1 / (1 + exp(-x)))
+  // For numerical stability,
+  // logsigmoid(x) =
+  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp = x > zero ? zero : -x;
+    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // dx = dout * exp(-x) / (1 + exp(-x))
+  // For numerical stability:
+  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
+  // 0)))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp1 = x > zero ? zero : -x;
+    MPType temp2 = exp(-x - temp1);
+    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // hard_sigmoid(x) = 0, when x <= -3
+  //                   1, when x >= 3
+  //                   x * slope + offset, otherwise
+  __device__ __forceinline__ T operator()(const T x) const {
+    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    T temp_max = temp > zero ? temp : zero;
+    T temp_min = temp_max < one ? temp_max : one;
+    return temp_min;
+  }
+};
+
+template <typename T>
+struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // dx = (out > 0 && out < 1) ? dout * slope : 0
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaLogFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // log(x) = log(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log(x));
+  }
+};
+
+template <typename T>
+struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
+  // dx = dout / x
   __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (one + x * x);
+    return dout / x;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // log1p(x) = log(1 + x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log(one + x));
+  }
+};
+
+template <typename T>
+struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout / (1 + x)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout / (one + x);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaLog2Functor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // log2(x) = log2(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log2(x));
+  }
+};
+
+template <typename T>
+struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
+
+  // dx = dout / (x * log(2))
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout / (x * log_two);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaLog10Functor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  // log10(x) = log10(x)
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log10(x));
+  }
+};
+
+template <typename T>
+struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
+
+  // dx = dout / (x * log(10))
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout / (x * log_ten);
   }
 
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h
index 21ebae8487ffc..a7ed7d36eb1c4 100644
--- a/paddle/phi/kernels/funcs/batch_norm_utils.h
+++ b/paddle/phi/kernels/funcs/batch_norm_utils.h
@@ -36,8 +36,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
     in_dims_vec[3] = input->dims()[2];
     in_dims_vec[4] = input->dims()[3];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
-
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 2) {
     // input
     transformed_input->Resize(input->dims());
@@ -47,7 +46,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
     in_dims_vec[2] = input->dims()[1];
     in_dims_vec[3] = input->dims()[2];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 1) {
     transformed_input->Resize(input->dims());
 
@@ -55,7 +54,7 @@ inline void ResizeToChannelFirst(const DeviceContext& context,
     in_dims_vec[1] = input->dims()[2];
     in_dims_vec[2] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   }
 }
 
@@ -74,7 +73,7 @@ inline void ResizeToChannelLast(const DeviceContext& context,
     in_dims_vec[3] = input->dims()[4];
     in_dims_vec[4] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
 
   } else if (dim == 2) {
     // input
@@ -85,7 +84,7 @@ inline void ResizeToChannelLast(const DeviceContext& context,
     in_dims_vec[2] = input->dims()[3];
     in_dims_vec[3] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   } else if (dim == 1) {
     transformed_input->Resize(input->dims());
 
@@ -93,7 +92,7 @@ inline void ResizeToChannelLast(const DeviceContext& context,
     in_dims_vec[1] = input->dims()[2];
     in_dims_vec[2] = input->dims()[1];
     transformed_input->Resize(make_ddim(in_dims_vec));
-    transformed_input->mutable_data<T>(context.GetPlace());
+    context.template Alloc<T>(transformed_input);
   }
 }
 
diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h
new file mode 100644
index 0000000000000..ac40523c1c437
--- /dev/null
+++ b/paddle/phi/kernels/funcs/cumprod.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+static void GetCumprodDimInfo(const DDim& dim,
+                              int cumprod_dim,
+                              size_t* outer_dim,
+                              size_t* mid_dim,
+                              size_t* inner_dim) {
+  PADDLE_ENFORCE_GE(
+      cumprod_dim,
+      -dim.size(),
+      phi::errors::InvalidArgument(
+          "The input dim of CumprodOp should be larger than the opposite "
+          "rank of input x which is %d.But received dim=%d",
+          -dim.size(),
+          cumprod_dim));
+  PADDLE_ENFORCE_LT(cumprod_dim,
+                    dim.size(),
+                    phi::errors::InvalidArgument(
+                        "The input dim of CumprodOp should be smaller than the "
+                        "rank of input x which is %d.But received dim=%d",
+                        dim.size(),
+                        cumprod_dim));
+  if (cumprod_dim < 0) cumprod_dim += dim.size();
+
+  *outer_dim = 1;
+  for (int i = 0; i < cumprod_dim; ++i) {
+    *outer_dim *= dim[i];
+  }
+  *mid_dim = dim[cumprod_dim];
+  *inner_dim = 1;
+  for (int i = cumprod_dim + 1; i < dim.size(); ++i) {
+    *inner_dim *= dim[i];
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h
index acc31d68b7859..68e986c334ecb 100644
--- a/paddle/phi/kernels/funcs/distribution_helper.h
+++ b/paddle/phi/kernels/funcs/distribution_helper.h
@@ -50,11 +50,15 @@ struct exponential_transform {
 
   HOSTDEVICE inline T operator()(T val) const {
 #if defined(__NVCC__) || defined(__HIPCC__)
-    if (std::is_same<T, double>::value) {
-      return static_cast<T>(-1.0) / lambda_ * log(val);
-    } else {
-      return static_cast<T>(-1.0) / lambda_ * __logf(val);
+    T log = -std::numeric_limits<T>::epsilon() / 2;
+    if (val < static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2) {
+      if (std::is_same<T, double>::value) {
+        log = logf(val);
+      } else {
+        log = __logf(val);
+      }
     }
+    return static_cast<T>(-1.0) / lambda_ * log;
 #else
     return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
 #endif
@@ -114,13 +118,19 @@ struct normal_transform {
 namespace kps = phi::kps;
 
 /*********************** Distribution Function *************************/
-template <typename T>
-struct uniform_distribution;
 
 template <typename T>
 struct normal_distribution;
 
 #if defined(__NVCC__)
+template <typename T>
+struct uniform_distribution {
+  __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const {
+    return static_cast<T>(curand_uniform(state));
+  }
+  static constexpr int kReturnsCount = 1;
+};
+
 template <>
 struct uniform_distribution<float> {
   __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
@@ -177,6 +187,14 @@ struct normal_distribution<double> {
 };
 
 #else
+template <typename T>
+struct uniform_distribution {
+  __device__ inline T operator()(hiprandStatePhilox4_32_10_t *state) const {
+    return hiprand_uniform(state);
+  }
+  static constexpr int kReturnsCount = 1;
+};
+
 template <>
 struct uniform_distribution<float> {
   __device__ inline float4 operator()(
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index f9e66836a6269..ac262fe2d571e 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -67,6 +67,11 @@ struct InverseMultiplyFunctor<bool> {
   }
 };
 
+template <typename T>
+struct IsZeroFunctor {
+  HOSTDEVICE bool operator()(T x) const { return x == static_cast<T>(0); }
+};
+
 // Divide
 #define DIV_ERROR_INFO                                             \
   "InvalidArgumentError: Integer division by zero encountered in " \
diff --git a/paddle/phi/kernels/funcs/embedding_util.h b/paddle/phi/kernels/funcs/embedding_util.h
new file mode 100644
index 0000000000000..20c4ddca05460
--- /dev/null
+++ b/paddle/phi/kernels/funcs/embedding_util.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+constexpr int64_t kNoPadding = -1;
+
+template <typename InT, typename OutT>
+static std::vector<OutT> CopyIdsToVector(const DenseTensor &ids) {
+  auto numel = ids.numel();
+  const auto *src = ids.data<InT>();
+  std::vector<OutT> ret(numel);
+  if (std::is_same<InT, OutT>::value) {
+    std::memcpy(ret.data(), src, numel * sizeof(InT));
+  } else {
+    for (decltype(numel) i = 0; i < numel; ++i) {
+      ret[i] = src[i];
+    }
+  }
+  return ret;
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/inclusive_scan.h b/paddle/phi/kernels/funcs/inclusive_scan.h
new file mode 100644
index 0000000000000..b285c5bdbbfc0
--- /dev/null
+++ b/paddle/phi/kernels/funcs/inclusive_scan.h
@@ -0,0 +1,274 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/reverse_iterator.h>
+#include "paddle/phi/common/type_traits.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/malloc.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+struct IsComplex : public std::false_type {};
+
+template <>
+struct IsComplex<::phi::dtype::complex<float>> : public std::true_type {};
+
+template <>
+struct IsComplex<::phi::dtype::complex<double>> : public std::true_type {};
+
+template <typename InputIterator, typename OutputIterator, typename BinaryOp>
+static void CubInclusiveScan(InputIterator x_iter,
+                             OutputIterator y_iter,
+                             size_t n,
+                             BinaryOp op,
+                             const phi::GPUContext &dev_ctx) {
+  paddle::memory::allocation::AllocationPtr allocation;
+  void *temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+  for (size_t i = 0; i < 2; ++i) {
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cub::DeviceScan::InclusiveScan(temp_storage,
+                                       temp_storage_bytes,
+                                       x_iter,
+                                       y_iter,
+                                       op,
+                                       static_cast<int>(n),
+                                       dev_ctx.stream()));
+    if (i == 0 && temp_storage_bytes > 0) {
+      allocation =
+          paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
+      temp_storage = allocation->ptr();
+    }
+  }
+}
+
+template <typename T>
+static auto MakeThrustReverseIterator(T *x) {
+  return thrust::reverse_iterator<thrust::device_ptr<T>>(
+      thrust::device_pointer_cast(x));
+}
+
+template <typename T, typename BinaryOp, bool kReverse>
+struct InclusiveScanOuterOrMidDimFunctor {
+  HOSTDEVICE InclusiveScanOuterOrMidDimFunctor(
+      const T *x, T *y, size_t mid_dim, size_t inner_dim, T init, BinaryOp op)
+      : x_(x),
+        y_(y),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        init_(init),
+        op_(op) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto outer_idx = idx / inner_dim_;
+    auto inner_idx = idx % inner_dim_;
+    if (kReverse) {
+      idx = outer_idx * mid_dim_ * inner_dim_ + (mid_dim_ - 1) * inner_dim_ +
+            inner_idx;
+    } else {
+      idx = outer_idx * mid_dim_ * inner_dim_ + inner_idx;
+    }
+
+    auto x_ptr = x_ + idx;
+    auto y_ptr = y_ + idx;
+    T acc_value = init_;
+    for (size_t i = 0; i < mid_dim_; ++i) {
+      acc_value = op_(acc_value, *x_ptr);
+      *y_ptr = acc_value;
+      if (kReverse) {
+        x_ptr -= inner_dim_;
+        y_ptr -= inner_dim_;
+      } else {
+        x_ptr += inner_dim_;
+        y_ptr += inner_dim_;
+      }
+    }
+  }
+
+ private:
+  const T *x_;
+  T *y_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T init_;
+  BinaryOp op_;
+};
+
+template <typename T,
+          typename BinaryOp,
+          size_t kThreadNumX,
+          size_t kThreadNumY,
+          bool kReverse>
+static __global__ void InclusiveScanInnerDimCUDAKernel(
+    const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) {
+  using RealT = phi::dtype::Real<T>;
+  constexpr auto kSharedBufferSize =
+      IsComplex<T>::value ? 4 * kThreadNumX : 2 * kThreadNumX;
+  __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize];
+  T *row_buf = reinterpret_cast<T *>(sbuf[threadIdx.y]);
+
+  size_t block_row = static_cast<size_t>(blockIdx.x * kThreadNumY);
+  size_t block_row_stride = static_cast<size_t>(gridDim.x * kThreadNumY);
+  for (; block_row < num_rows; block_row += block_row_stride) {
+    size_t row = block_row + threadIdx.y;
+    T block_total = init;
+
+    const T *row_x = x + row * row_size;
+    T *row_y = y + row * row_size;
+    for (size_t block_col = 0; block_col < row_size;
+         block_col += 2 * kThreadNumX) {
+      size_t col1, col2;
+      if (kReverse) {
+        col1 = row_size - 1 - block_col - threadIdx.x;
+        col2 = col1 - kThreadNumX;
+      } else {
+        col1 = block_col + threadIdx.x;
+        col2 = col1 + kThreadNumX;
+      }
+
+      if (row < num_rows) {
+        if (col1 < row_size) {
+          row_buf[threadIdx.x] = row_x[col1];
+        } else {
+          row_buf[threadIdx.x] = init;
+        }
+
+        if (col2 < row_size) {
+          row_buf[kThreadNumX + threadIdx.x] = row_x[col2];
+        } else {
+          row_buf[kThreadNumX + threadIdx.x] = init;
+        }
+
+        if (threadIdx.x == 0) {
+          row_buf[0] = op(row_buf[0], block_total);
+        }
+      }
+      __syncthreads();
+
+      for (size_t s = kThreadNumX, d = 1; s >= 1; s >>= 1, d <<= 1) {
+        if (row < num_rows && threadIdx.x < s) {
+          size_t offset = (2 * threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      for (size_t s = 2, d = kThreadNumX / 2; d >= 1; s <<= 1, d >>= 1) {
+        if (row < num_rows && threadIdx.x < s - 1) {
+          size_t offset = 2 * (threadIdx.x + 1) * d - 1;
+          row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
+        }
+        __syncthreads();
+      }
+
+      if (row < num_rows) {
+        if (col1 < row_size) row_y[col1] = row_buf[threadIdx.x];
+        if (col2 < row_size) row_y[col2] = row_buf[kThreadNumX + threadIdx.x];
+      }
+      block_total = row_buf[2 * kThreadNumX - 1];
+      __syncthreads();
+    }
+  }
+}
+
+template <typename T, typename BinaryOp>
+static void InclusiveScanInnerDim(const T *x,
+                                  T *y,
+                                  size_t outer_dim,
+                                  size_t inner_dim,
+                                  T init,
+                                  BinaryOp op,
+                                  bool reverse,
+                                  const phi::GPUContext &dev_ctx) {
+  constexpr size_t kThreadNumX = 16;
+  constexpr size_t kThreadNumY = 32;
+
+  size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
+  grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
+  dim3 thread_dims(kThreadNumX, kThreadNumY);
+  if (reverse) {
+    InclusiveScanInnerDimCUDAKernel<
+        T,
+        BinaryOp,
+        kThreadNumX,
+        kThreadNumY,
+        /*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+        x, y, outer_dim, inner_dim, init, op);
+  } else {
+    InclusiveScanInnerDimCUDAKernel<
+        T,
+        BinaryOp,
+        kThreadNumX,
+        kThreadNumY,
+        /*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
+        x, y, outer_dim, inner_dim, init, op);
+  }
+}
+
+template <typename T, typename BinaryOp>
+void InclusiveScan(const T *x,
+                   T *y,
+                   size_t outer_dim,
+                   size_t mid_dim,
+                   size_t inner_dim,
+                   T init,
+                   BinaryOp op,
+                   bool reverse,
+                   const phi::GPUContext &dev_ctx) {
+  if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
+
+  if (outer_dim == 1 && inner_dim == 1) {
+    if (reverse) {
+      auto x_reverse_iter = MakeThrustReverseIterator(x + mid_dim);
+      auto y_reverse_iter = MakeThrustReverseIterator(y + mid_dim);
+      CubInclusiveScan(x_reverse_iter, y_reverse_iter, mid_dim, op, dev_ctx);
+    } else {
+      CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
+    }
+  } else if (inner_dim != 1) {
+    phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx,
+                                                    outer_dim * inner_dim);
+    if (reverse) {
+      for_range(
+          InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
+              x, y, mid_dim, inner_dim, init, op));
+    } else {
+      for_range(
+          InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/false>(
+              x, y, mid_dim, inner_dim, init, op));
+    }
+  } else {
+    InclusiveScanInnerDim<T, BinaryOp>(
+        x, y, outer_dim, mid_dim, init, op, reverse, dev_ctx);
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h
new file mode 100644
index 0000000000000..e78730cbf3849
--- /dev/null
+++ b/paddle/phi/kernels/funcs/layer_norm_util.h
@@ -0,0 +1,165 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+namespace funcs {
+
+// Wrap RowwiseMean and ColwiseMean.
+// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
+// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
+// implementation only considers 2D.
+template <typename DeviceContext, typename T>
+struct RowwiseMean2D {
+  RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx);
+
+  void operator()(const DeviceContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* vec);
+};
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename T>
+class RowwiseMean2D<phi::GPUContext, T> {
+ public:
+  RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx)
+      : left_(left), right_(right) {
+    DDim ones_dim({right_});
+    divisor_.Resize(ones_dim);
+    dev_ctx.template Alloc<T>(&divisor_);
+    phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right);
+  }
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* out) {
+    phi::funcs::GetBlas<phi::GPUContext, T>(context).GEMV(false,
+                                                          left_,
+                                                          right_,
+                                                          1.,
+                                                          input.data<T>(),
+                                                          divisor_.data<T>(),
+                                                          0.,
+                                                          out->data<T>());
+  }
+
+ private:
+  int left_;
+  int right_;
+  DenseTensor divisor_;
+};
+#endif
+
+template <typename T>
+class RowwiseMean2D<phi::CPUContext, T> {
+ public:
+  RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {}
+
+  void operator()(const phi::CPUContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* out) {
+    row_mean_(context, input, out);
+  }
+
+ private:
+  phi::funcs::RowwiseMean<phi::CPUContext, T> row_mean_;
+};
+
+template <typename DeviceContext, typename T>
+struct ColwiseSum2D {
+  ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx);
+
+  void operator()(const phi::DeviceContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* vec);
+};
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename T>
+class ColwiseSum2D<phi::GPUContext, T> {
+ public:
+  ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx)
+      : left_(left), right_(right) {
+    DDim ones_dim({left_});
+    divisor_.Resize(ones_dim);
+    dev_ctx.template Alloc<T>(&divisor_);
+    phi::funcs::set_constant(dev_ctx, &divisor_, 1.0);
+  }
+
+  void operator()(const phi::GPUContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* out) {
+    phi::funcs::GetBlas<phi::GPUContext, T>(context).GEMV(true,
+                                                          left_,
+                                                          right_,
+                                                          1.,
+                                                          input.data<T>(),
+                                                          divisor_.data<T>(),
+                                                          0.,
+                                                          out->data<T>());
+  }
+
+ private:
+  int left_;
+  int right_;
+  DenseTensor divisor_;
+};
+#endif
+
+template <typename T>
+class ColwiseSum2D<phi::CPUContext, T> {
+ public:
+  ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {}
+
+  void operator()(const phi::CPUContext& context,
+                  const DenseTensor& input,
+                  DenseTensor* out) {
+    col_wise_(context, input, out);
+  }
+
+ private:
+  phi::funcs::ColwiseSum<phi::CPUContext, T> col_wise_;
+};
+
+template <typename T>
+struct SubAndSquareFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
+};
+
+template <typename T>
+struct DivAndSqrtFunctor {
+  explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; }
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a / (sqrt(b + epsilon_));
+  }
+
+ private:
+  T epsilon_;
+};
+
+template <typename T>
+struct MulInvVarFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const {
+    return a * std::sqrt(1.0 / b);
+  }
+};
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc
index 4201a75be8ac7..afa2214f5b9df 100644
--- a/paddle/phi/kernels/funcs/math_function.cc
+++ b/paddle/phi/kernels/funcs/math_function.cc
@@ -331,12 +331,20 @@ template struct ColwiseSum<paddle::platform::CPUDeviceContext, double>;
 template struct ColwiseSum<paddle::platform::CPUDeviceContext, int>;
 template struct ColwiseSum<paddle::platform::CPUDeviceContext, int64_t>;
 
+template struct ColwiseSum<phi::CPUContext, float>;
+template struct ColwiseSum<phi::CPUContext, double>;
+template struct ColwiseSum<phi::CPUContext, int>;
+template struct ColwiseSum<phi::CPUContext, int64_t>;
+
 template struct RowwiseSum<paddle::platform::CPUDeviceContext, float>;
 template struct RowwiseSum<paddle::platform::CPUDeviceContext, double>;
 
 template struct RowwiseMean<paddle::platform::CPUDeviceContext, float>;
 template struct RowwiseMean<paddle::platform::CPUDeviceContext, double>;
 
+template struct RowwiseMean<phi::CPUContext, float>;
+template struct RowwiseMean<phi::CPUContext, double>;
+
 template <typename T>
 struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
   void operator()(paddle::platform::CPUDeviceContext* ctx,
diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h
new file mode 100644
index 0000000000000..3bd6c19545e16
--- /dev/null
+++ b/paddle/phi/kernels/funcs/mode.h
@@ -0,0 +1,197 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_MKLML
+#include <omp.h>
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+namespace funcs {
+
+static int ComputeBlockSize(int col) {
+  if (col > 512)
+    return 1024;
+  else if (col > 256 && col <= 512)
+    return 512;
+  else if (col > 128 && col <= 256)
+    return 256;
+  else if (col > 64 && col <= 128)
+    return 128;
+  else
+    return 64;
+}
+
+static inline void GetDims(
+    const phi::DDim& dim, int axis, int* pre, int* n, int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
+template <typename T, typename Type>
+static void GetMode(Type input_height,
+                    Type input_width,
+                    int input_dim,
+                    const DenseTensor* input,
+                    T* t_out,
+                    Type* t_indices) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    std::vector<std::pair<T, Type>> col_vec;
+    col_vec.reserve(input_width);
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
+      }
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      for (Type j = 0; j < input_width; ++j) {
+        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
+      }
+    }
+    std::sort(col_vec.begin(),
+              col_vec.end(),
+              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
+                return (!std::isnan(static_cast<double>(l.first)) &&
+                        std::isnan(static_cast<double>(r.first))) ||
+                       (l.first < r.first);
+              });
+    T mode = 0;
+    int64_t indice = 0;
+    int64_t cur_freq = 0;
+    int64_t max_freq = 0;
+    for (int64_t i = 0; i < input_width; ++i) {
+      ++cur_freq;
+      if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) {
+        if (cur_freq > max_freq) {
+          max_freq = cur_freq;
+          mode = col_vec[i].first;
+          indice = col_vec[i].second;
+        }
+        cur_freq = 0;
+      }
+    }
+    t_out[i] = mode;
+    t_indices[i] = indice;
+  }
+}
+
+template <typename T, typename Type>
+static void ModeAssign(const Type& input_height,
+                       const Type& input_width,
+                       const int& input_dim,
+                       const DenseTensor* input,
+                       const DenseTensor* indices,
+                       T* output_data) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (Type i = 0; i < input_height; ++i) {
+    if (input_dim == 1) {
+      auto e_input = EigenVector<T>::Flatten(*input);
+      auto e_indices = EigenVector<Type>::Flatten(*indices);
+      output_data[i * input_width + e_indices(0)] = e_input(0);
+    } else {
+      auto e_input = EigenMatrix<T>::Reshape(*input, input_dim - 1);
+      auto e_indices = EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
+      output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0);
+    }
+  }
+}
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+static void GetModebySort(const phi::GPUContext& dev_ctx,
+                          const DenseTensor* input_tensor,
+                          const int64_t num_cols,
+                          const int64_t num_rows,
+                          T* out_tensor,
+                          int64_t* indices_tensor) {
+  DenseTensor input_tmp;
+  input_tmp.Resize(phi::make_ddim({num_rows, num_cols}));
+  T* input_tmp_data = dev_ctx.Alloc<T>(&input_tmp);
+  phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp);
+
+  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
+  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
+
+  for (int64_t i = 0; i < num_rows; ++i) {
+    T* begin = input_tmp_data + num_cols * i;
+    T* end = input_tmp_data + num_cols * (i + 1);
+    thrust::device_vector<int64_t> indices_data(num_cols);
+    thrust::sequence(
+        thrust::device, indices_data.begin(), indices_data.begin() + num_cols);
+    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
+    int unique = 1 + thrust::inner_product(thrust::device,
+                                           begin,
+                                           end - 1,
+                                           begin + 1,
+                                           0,
+                                           thrust::plus<int>(),
+                                           thrust::not_equal_to<T>());
+    thrust::device_vector<T> keys_data(unique);
+    thrust::device_vector<int64_t> cnts_data(unique);
+    thrust::reduce_by_key(thrust::device,
+                          begin,
+                          end,
+                          thrust::constant_iterator<int>(1),
+                          keys_data.begin(),
+                          cnts_data.begin());
+    auto it = thrust::max_element(
+        thrust::device, cnts_data.begin(), cnts_data.begin() + unique);
+    T mode = keys_data[it - cnts_data.begin()];
+    int64_t counts = cnts_data[it - cnts_data.begin()];
+    auto pos = thrust::find(thrust::device, begin, end, mode);
+    int64_t index = indices_data[pos - begin + counts - 1];
+    out_tensor_ptr[i] = static_cast<T>(mode);
+    indices_tensor_ptr[i] = static_cast<int64_t>(index);
+  }
+}
+#endif
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu
index 4cf5e1c02c597..417c1cd234754 100644
--- a/paddle/phi/kernels/funcs/pooling.cu
+++ b/paddle/phi/kernels/funcs/pooling.cu
@@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
   int nthreads = batch_size * output_channels * output_height * output_width;
   int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-  // paddle::platform::ChangeThreadNum(context, &thread_num);
+  // backends::gpu::ChangeThreadNum(context, &thread_num);
   thread_num = 512;
 #endif
   int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -460,7 +460,7 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -527,7 +527,7 @@ class Pool2dFunctor<phi::GPUContext, PoolProcess, T> {
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -1293,7 +1293,7 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -1369,7 +1369,7 @@ class Pool3dFunctor<phi::GPUContext, PoolProcess, T> {
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
     int blocks = (nthreads + thread_num - 1) / thread_num;
     dim3 threads(thread_num, 1);
@@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor<phi::GPUContext, T1, T2> {
     int nthreads = batch_size * output_channels * output_height * output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
 
     int blocks = (nthreads + thread_num - 1) / thread_num;
@@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor<phi::GPUContext, T1, T2> {
                    output_width;
     int thread_num = 1024;
 #ifdef WITH_NV_JETSON
-    paddle::platform::ChangeThreadNum(context, &thread_num);
+    backends::gpu::ChangeThreadNum(context, &thread_num);
 #endif
 
     int blocks = (nthreads + thread_num - 1) / thread_num;
diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h
index 19c6d52c4c901..fa285dc69d1ca 100644
--- a/paddle/phi/kernels/funcs/pooling.h
+++ b/paddle/phi/kernels/funcs/pooling.h
@@ -43,7 +43,7 @@ template <class T>
 class MaxPool {
  public:
   DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  HOSTDEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
   DEVICE inline void finalize(const T& pool_field, T* y) {}
 };
 
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index 5834f091d9a4d..85c371e9f9d45 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -14,8 +14,8 @@
 
 #pragma once
 
-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// CUDA, XPU and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)
 
 #include <algorithm>
 #include <cmath>
@@ -220,7 +220,7 @@ struct IndexCalculator {
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
   phi::Array<int, kMaxRank> reduce_strides;
-#ifndef PADDLE_WITH_XPU2
+#ifndef PADDLE_WITH_XPU_KP
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
 #endif
 };
@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
   HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
       : dim(dims) {}
 
+#ifdef PADDLE_WITH_XPU_KP
   __device__ __forceinline__ int BlockIdX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     } else {
       return cluster_id() % dim.split_num_x;
     }
-#else
-    return blockIdx.x;
-#endif
   }
 
   __device__ __forceinline__ int BlockIdY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() % dim.split_num_x);
     } else {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     }
-#else
-    return blockIdx.y;
-#endif
   }
 
-  __device__ __forceinline__ int BlockDimX() {
-#ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_x;
-#else
-    return blockDim.x;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }
 
-  __device__ __forceinline__ int BlockDimY() {
-#ifdef PADDLE_WITH_XPU2
-    return 1;
-#else
-    return blockDim.y;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimY() { return 1; }
 
   __device__ __forceinline__ int GridDimX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_y;
     } else {
       return dim.split_num_x;
     }
-#else
-    return gridDim.x;
-#endif
   }
 
   __device__ __forceinline__ int GridDimY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_x;
     } else {
       return dim.split_num_y;
     }
-#else
-    return gridDim.y;
-#endif
   }
 
   __device__ __forceinline__ int GetLoopSize() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.deal_size_y;
     } else {
       return dim.deal_size_x;
     }
+  }
 #else
-    return 1;
+  __device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
+
+  __device__ __forceinline__ int BlockIdY() { return blockIdx.y; }
+
+  __device__ __forceinline__ int BlockDimX() { return blockDim.x; }
+
+  __device__ __forceinline__ int BlockDimY() { return blockDim.y; }
+
+  __device__ __forceinline__ int GridDimX() { return gridDim.x; }
+
+  __device__ __forceinline__ int GridDimY() { return gridDim.y; }
+
+  __device__ int GetLoopSize() { return 1; }
 #endif
-  }
 };
 
 // when reduce_type == kReduceLastDim this struct will be used
@@ -341,7 +325,7 @@ struct ReduceConfig {
 
   // when should_reduce_again is true, we need malloc temp space for temp data
   void SetOutputData(Ty* y_data,
-                     const phi::GPUContext& dev_ctx,
+                     const KPDevice& dev_ctx,
                      phi::DenseTensor* tmp) {
     if (should_reduce_again) {
       tmp->Resize(phi::make_ddim(
@@ -640,9 +624,7 @@ struct ReduceConfig {
   int blocking_size;
   bool should_reduce_again;
   bool reduce_last_dim;
-
   Ty* output_data;
-
   dim3 block;
   dim3 grid;
 };
@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,
 
     kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
-    if (need_store) {
-      y[store_offset + i] = static_cast<Ty>(reduce_var);
-    }
+
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::details::WriteData<Ty>(
+        y + store_offset + i, &result, static_cast<int>(need_store));
   }
 }
 
@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
 
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+                    OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
 
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);
 
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    IndexCalculator><<<config.grid, config.block, 0, stream>>>(
+                    IndexCalculator><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
   }
 
   if (config.should_reduce_again) {
@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim.SetRem(config.left_num % block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceHigherDimKernel<
-        Ty,
-        Ty,
-        MPType,
-        ReduceOp,
-        kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim);
-#else
+    grid = 8;
+    block = 64;
+#endif
     ReduceHigherDimKernel<
         Ty,
         Ty,
@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.left_num,
         config.grid.y,
         dim);
-#endif
   }
 }
 
@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
@@ -1087,12 +1030,16 @@ template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp,
           typename TransformOp>
-void ReduceKernel(const phi::GPUContext& dev_ctx,
+void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* y,
                   const TransformOp& transform,
                   const std::vector<int>& origin_reduce_dims) {
+#ifdef PADDLE_WITH_XPU_KP
+  auto stream = dev_ctx.x_context()->xpu_stream;
+#else
   auto stream = dev_ctx.stream();
+#endif
   dev_ctx.Alloc<Ty>(y);
 
   auto x_dim = phi::vectorize<int>(x.dims());
@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
                0);
 
 #ifdef PADDLE_WITH_XPU_KP
+    auto grid_num = 8;
+    auto block_num = 64;
+#else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceHigherDimKernel<Tx,
                           Ty,
                           MPType,
                           ReduceOp<MPType>,
-                          TransformOp><<<8, 64, 0, stream>>>(
+                          TransformOp><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
         config.left_num,
         config.blocking_size,
         dim);
-#else
-    ReduceHigherDimKernel<
-        Tx,
-        Ty,
-        MPType,
-        ReduceOp<MPType>,
-        TransformOp><<<config.grid, config.block, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        reducer.initial(),
-        config.reduce_num,
-        config.left_num,
-        config.blocking_size,
-        dim);
-#endif
 
     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
       dim2.SetRem(config.left_num % config.block.x, 0, 0);
 
 #ifdef PADDLE_WITH_XPU_KP
-      ReduceHigherDimKernel<
-          Ty,
-          Ty,
-          MPType,
-          ReduceOp<MPType>,
-          kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-          config.output_data,
-          y_data,
-          reducer,
-          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(),
-          config.grid.y,
-          config.left_num,
-          config.grid.y,
-          dim2);
-#else
+      grid = 8;
+      block = 64;
+#endif
       ReduceHigherDimKernel<
           Ty,
           Ty,
@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
           config.left_num,
           config.grid.y,
           dim2);
-#endif
     }
     return;
   }
diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h
index 4e83d0fa37103..9bf1bfecabbf2 100644
--- a/paddle/phi/kernels/funcs/reduce_functor.h
+++ b/paddle/phi/kernels/funcs/reduce_functor.h
@@ -17,11 +17,39 @@
 namespace phi {
 namespace funcs {
 
-//////// Sum Functor ///////
-struct SumFunctor {
+//////// Frobenius Norm Functor ///////
+struct FrobeniusNormFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->sum(dim);
+    y->device(place) = ((x->square()).sum(dim)).sqrt();
+  }
+};
+
+struct FrobeniusNormGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = y->broadcast(dim);
+    dx->device(place) = *dx + dx->constant(1e-12f);
+    dx->device(place) = (*x / *dx) * (dy->broadcast(dim));
+  }
+};
+
+//////// Max Functor ///////
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
   }
 };
 
@@ -41,11 +69,112 @@ struct ProdFunctor {
   }
 };
 
-//////// Max Functor ///////
-struct MaxFunctor {
+//////// Sum Functor ///////
+struct SumFunctor {
   template <typename DeviceContext, typename X, typename Y, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->maximum(dim);
+    y->device(place) = x->sum(dim);
+  }
+};
+
+//////// Min Functor ///////
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+//////// All Functor ///////
+struct AllFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->all(dim);
+  }
+};
+
+//////// Any Functor ///////
+struct AnyFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->any(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext,
+            typename X,
+            typename Y,
+            typename DX,
+            typename DY,
+            typename Dim>
+  void operator()(const DeviceContext& place,
+                  X* x,
+                  Y* y,
+                  DX* dx,
+                  DY* dy,
+                  const Dim& dim,
+                  int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
   }
 };
 
diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h
index 3488b6f2f86b2..11197a52261d7 100644
--- a/paddle/phi/kernels/funcs/reduce_grad_functions.h
+++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h
@@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx,
   Eigen::array<int, D> broadcast_dim;
   for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
 
-  int broad_cats_times = 1;
+  int broad_cast_times = 1;
   for (size_t i = 0; i < dims_ref.size(); ++i) {
     if (dims_ref[i] < 0) {
       dims_ref[i] = x_rank + dims_ref[i];
     }
     reduced_dims_v[dims_ref[i]] = 1;
     broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
-    broad_cats_times *= x_dims[dims_ref[i]];
+    broad_cast_times *= x_dims[dims_ref[i]];
   }
   auto reduced_dims = phi::make_ddim(reduced_dims_v);
   auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
@@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx,
           &x_grad,
           &x_reduce_grad,
           broadcast_dim,
-          broad_cats_times);
+          broad_cast_times);
 }
 
 inline void GetOriginDimFromShuffled(const DDim& src_dim,
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc
index bf4a21f37223d..fbd744430aa11 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cc
+++ b/paddle/phi/kernels/funcs/segment_pooling.cc
@@ -149,10 +149,19 @@ template class SegmentPoolFunctor<CPU, float, int>;
 template class SegmentPoolFunctor<CPU, float, int64_t>;
 template class SegmentPoolFunctor<CPU, double, int>;
 template class SegmentPoolFunctor<CPU, double, int64_t>;
+template class SegmentPoolFunctor<CPU, int, int>;
+template class SegmentPoolFunctor<CPU, int, int64_t>;
+template class SegmentPoolFunctor<CPU, int64_t, int>;
+template class SegmentPoolFunctor<CPU, int64_t, int64_t>;
+
 template class SegmentPoolGradFunctor<CPU, float, int>;
 template class SegmentPoolGradFunctor<CPU, float, int64_t>;
 template class SegmentPoolGradFunctor<CPU, double, int>;
 template class SegmentPoolGradFunctor<CPU, double, int64_t>;
+template class SegmentPoolGradFunctor<CPU, int, int>;
+template class SegmentPoolGradFunctor<CPU, int, int64_t>;
+template class SegmentPoolGradFunctor<CPU, int64_t, int>;
+template class SegmentPoolGradFunctor<CPU, int64_t, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu
index 305cd39f077bc..95606b1526729 100644
--- a/paddle/phi/kernels/funcs/segment_pooling.cu
+++ b/paddle/phi/kernels/funcs/segment_pooling.cu
@@ -453,10 +453,19 @@ template class SegmentPoolFunctor<GPU, float, int>;
 template class SegmentPoolFunctor<GPU, float, int64_t>;
 template class SegmentPoolFunctor<GPU, double, int>;
 template class SegmentPoolFunctor<GPU, double, int64_t>;
+template class SegmentPoolFunctor<GPU, int, int>;
+template class SegmentPoolFunctor<GPU, int, int64_t>;
+template class SegmentPoolFunctor<GPU, int64_t, int>;
+template class SegmentPoolFunctor<GPU, int64_t, int64_t>;
+
 template class SegmentPoolGradFunctor<GPU, float, int>;
 template class SegmentPoolGradFunctor<GPU, float, int64_t>;
 template class SegmentPoolGradFunctor<GPU, double, int>;
 template class SegmentPoolGradFunctor<GPU, double, int64_t>;
+template class SegmentPoolGradFunctor<GPU, int, int>;
+template class SegmentPoolGradFunctor<GPU, int, int64_t>;
+template class SegmentPoolGradFunctor<GPU, int64_t, int>;
+template class SegmentPoolGradFunctor<GPU, int64_t, int64_t>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h
new file mode 100644
index 0000000000000..16e00414ad772
--- /dev/null
+++ b/paddle/phi/kernels/funcs/select_impl.cu.h
@@ -0,0 +1,476 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include <algorithm>
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
+
+namespace kps = phi::kps;
+
+namespace phi {
+namespace funcs {
+using Mode = kps::details::ReduceMode;
+
+/*
+* Count how many of the data being processed by the current block are true
+* 1. Load data from global memory and cast from bool to int64_t
+* 2. Get result of this thread according to thread reduce
+* 3. Get result of this block according to block reduce
+* 4. first block store 0 and current result
+*/
+template <typename T>
+struct NonZeroFunctor {
+  HOSTDEVICE NonZeroFunctor() {}
+  HOSTDEVICE inline T operator()(const T in) {
+    if (in) {
+      return static_cast<T>(1);
+    } else {
+      return static_cast<T>(0);
+    }
+  }
+};
+
+template <typename InT, typename OutT, int VecSize, int IsBoundary>
+__device__ void GetBlockCountImpl(const InT *in,
+                                  OutT *out,
+                                  int num,
+                                  int repeat) {
+  InT in_data[VecSize];
+  OutT temp[VecSize];
+  OutT result = static_cast<OutT>(0.0f);
+  using Add = kps::AddFunctor<OutT>;
+  using Cast = NonZeroFunctor<InT>;
+  int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
+
+  kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
+  kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
+  kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
+      &temp[0], &in_data[0], Cast());
+  kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+      &result, &temp[0], Add(), true);
+  kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
+      &result, &result, Add(), true);
+  if (store_fix == 0) {
+    // first block's fix_size = 0;
+    OutT tmp = static_cast<OutT>(0.0f);
+    kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
+  }
+
+  // store num of this block
+  kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
+}
+
+// Count how many data is not zero in current block
+template <typename InT, typename OutT, int VecSize>
+__global__ void GetBlockCountKernel(const InT *in,
+                                    OutT *out,
+                                    int64_t numel,
+                                    int64_t main_offset) {
+  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
+  int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
+  int repeat = 0;
+  for (; data_offset < main_offset; data_offset += stride) {
+    GetBlockCountImpl<InT, OutT, VecSize, false>(
+        in + data_offset, out, BLOCK_NUM_X * VecSize, repeat);
+    repeat++;  // to get the real blockIdx
+  }
+
+  int num = numel - data_offset;
+  if (num > 0) {
+    GetBlockCountImpl<InT, OutT, VecSize, true>(
+        in + data_offset, out, num, repeat);
+  }
+}
+
+/*
+* Get block num prefix us one block, VecSize must be 2
+* 1. Each thread load 2 data : threadIdx.x and threadIdx.x + blockDimx.x
+* 2. Cumsum limitation is blockDim.x must be less than 512
+*/
+
+template <typename InT,
+          typename OutT,
+          typename Functor,
+          int VecSize,
+          bool IsBoundary>
+__device__ void CumsumImpl(
+    const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) {
+  __shared__ OutT max_thread_data;
+  OutT temp[VecSize];
+  InT arg[VecSize];
+  OutT result[VecSize];
+  // init data_pr
+  kps::Init<InT, VecSize>(&arg[0], static_cast<InT>(0.0f));
+  // set pre_cumsum
+  kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
+  // load data to arg
+  kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
+      &arg[0], in, num, 1, BLOCK_NUM_X, 1);
+  // block cumsum
+  kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
+  // result = cumsum_result + pre_cumsum
+  kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
+      &result[0], &result[0], &temp[0], func);
+  // get the last prefix sum
+  if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
+    max_thread_data = result[VecSize - 1];
+  }
+  __syncthreads();
+  // update pre_cumsum
+  *pre_cumsum = max_thread_data;
+  kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
+      out, &result[0], num, 1, BLOCK_NUM_X, 1);
+}
+
+// Compute this store_offset of this block
+template <typename InT, typename OutT, typename Functor, int VecSize>
+__global__ void CumsumOneBlock(
+    const InT *in, OutT *out, int numel, int main_offset, Functor func) {
+  int stride = BLOCK_NUM_X * VecSize;
+  int offset = 0;
+  OutT pre_cumsum = static_cast<OutT>(0);
+  for (; offset < main_offset; offset += stride) {
+    CumsumImpl<InT, OutT, Functor, VecSize, false>(
+        in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func);
+  }
+
+  int num = numel - offset;
+  if (num > 0) {
+    CumsumImpl<InT, OutT, Functor, VecSize, true>(
+        in + offset, out + offset, &pre_cumsum, num, func);
+  }
+}
+
+// where_index
+template <typename OutT,
+          typename MT,
+          typename InT,
+          typename Functor,
+          int VecSize,
+          int IsBoundary,
+          int MaskData>
+struct SelectCaller {
+  __device__ void inline operator()(OutT *out,
+                                    const MT *mask_data,
+                                    const InT *in,
+                                    Functor func,
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
+    int64_t in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
+    // set index
+    kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
+    // Get store data according to mask_idt
+    kps::OperatorTernary<MT, int64_t, OutT, Functor>(
+        store_data, mask_data, &in_data[0], func, VecSize);
+    kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
+  }
+};
+
+// masked_select
+template <typename OutT,
+          typename MT,
+          typename InT,
+          typename Functor,
+          int VecSize,
+          int IsBoundary>
+struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
+  __device__ void inline operator()(OutT *out,
+                                    const MT *mask_data,
+                                    const InT *in,
+                                    Functor func,
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
+    InT in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
+    kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
+    // Get store data according to mask_idt
+    kps::OperatorTernary<MT, InT, OutT, Functor>(
+        store_data, mask_data, &in_data[0], func, VecSize);
+    kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
+  }
+};
+
+// masked_select_grad
+template <typename OutT,
+          typename MT,
+          typename InT,
+          typename Functor,
+          int VecSize,
+          int IsBoundary>
+struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
+  __device__ void inline operator()(OutT *out,
+                                    const MT *mask_data,
+                                    const InT *in,
+                                    Functor func,
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
+    InT in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
+    kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
+    kps::OperatorTernary<MT, InT, OutT, Functor>(
+        store_data, mask_data, &in_data[0], func, VecSize);
+    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
+  }
+};
+
+/**
+* Get mask's index if mask == true
+*/
+template <typename InT,
+          typename MT,
+          typename OutT,
+          typename Functor,
+          int VecSize,
+          int MaskData,
+          int IsBoundary>  // SelectType = 1 Mask_select else where_index
+__device__ void
+SelectKernelImpl(OutT *out,
+                 const MT *mask,
+                 const InT *in,
+                 Functor func,
+                 int num,
+                 int data_offset,
+                 int store_rank) {
+  const int kCVecSize = 2;
+  // each thread cumsum 2 data
+  using IdT = int64_t;
+  // Set index data type
+  using Add = kps::AddFunctor<IdT>;  // for cumsum
+  using Cast = NonZeroFunctor<InT>;  // for mask
+
+  IdT init_idx = static_cast<IdT>(0.0f);
+  MT init_mask = static_cast<MT>(0.0f);
+
+  IdT num_thread[kCVecSize];
+  IdT cumsum_thread[kCVecSize];
+
+  MT mask_data[VecSize];
+  IdT mask_idt[VecSize];
+  // init data_pr
+  kps::Init<IdT, kCVecSize>(&cumsum_thread[0], init_idx);
+  kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
+  kps::Init<MT, VecSize>(&mask_data[0], init_mask);
+  // Load mask
+  kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
+  // Cast from MT to int
+  kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
+      &mask_idt[0], &mask_data[0], Cast());
+  // Get the num of thread only num_thread[1] has data
+  kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
+      &num_thread[0], &mask_idt[0], Add(), true);
+  // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
+  // thread_fix
+  kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
+  // get thread_fix
+  int thread_fix =
+      (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
+  // get how many data need to store
+  int store_num = static_cast<int>(num_thread[0]) * store_rank;
+  // thread store num data, each thread may has different num
+  // Get store data(index) according to mask_idt
+  SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, MaskData> select;
+  select(out, mask_data, in, func, data_offset, store_num, thread_fix, num);
+}
+
+template <typename MT,
+          typename InT,
+          typename CT,
+          typename OutT,
+          typename Functor,
+          int VecSize,
+          int MaskData>
+__global__ void SelectKernel(OutT *out,
+                             const MT *mask,
+                             const InT *in,
+                             CT *cumsum,
+                             Functor func,
+                             const int64_t numel,
+                             int64_t main_offset,
+                             int store_rank) {
+  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
+  int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
+  int repeat = 0;
+  int size = VecSize * BLOCK_ID_X;
+  CT block_store_offset = 0;
+  for (; data_offset < main_offset; data_offset += stride) {
+    // Cumsum index
+    int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
+    kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
+    int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
+    int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
+    SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, false>(
+        out + out_fix,
+        mask + data_offset,
+        in + in_fix,
+        func,
+        size,
+        data_offset,
+        store_rank);
+    repeat++;
+  }
+
+  int num = numel - data_offset;
+  if (num > 0) {
+    // Cumsum index
+    int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
+    kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
+    int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
+    int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
+    SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, true>(
+        out + out_fix,
+        mask + data_offset,
+        in + in_fix,
+        func,
+        num,
+        data_offset,
+        store_rank);
+  }
+}
+
+inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; }
+
+// SelectData = 1 then masked_select; SelectData = 0 then where_index
+template <typename MT,
+          typename InT,
+          typename OutT,
+          int SelectData,
+          typename Functor>
+void SelectKernel(const KPDevice &dev_ctx,
+                  const DenseTensor &condition,
+                  const DenseTensor &in_data,
+                  DenseTensor *out,
+                  Functor func) {
+  const MT *cond_data = condition.data<MT>();
+  const int64_t numel = condition.numel();
+  auto dims = condition.dims();
+  int rank = SelectData ? 1 : dims.size();
+  const InT *in_data_ptr = SelectData ? in_data.data<InT>() : nullptr;
+  // calculate the inclusive prefix sum of "true_num_array"
+  // to get the index of "out" tensor,
+  // and the total number of cond_data[i]==true.
+  // Example:
+  // condition: F T T F F F T T
+  // before:    0 1 1 0 0 0 1 1
+  // after:     0 1 2 2 2 2 3 4
+  // out:       1 2 6 7
+  // alloc for cpu
+  using CT = int64_t;  // set Count_data Type
+  const int t_size = sizeof(CT);
+
+  const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace();
+  paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
+
+  // 1.1 get stored data num of per block
+  int total_true_num = 0;  // init
+  const int kVecSize = 4;
+#ifdef PADDLE_WITH_XPU_KP
+  int block = 64;
+  auto stream = dev_ctx.x_context()->xpu_stream;
+  const int num_per_block = kVecSize * block;
+  const int need_grids = (numel + num_per_block - 1) / num_per_block;
+  const int grid = std::min(need_grids, 8);
+#else
+  const int block = 256;
+  const int num_per_block = kVecSize * block;
+  const int need_grids = (numel + num_per_block - 1) / num_per_block;
+  const int grid = std::min(need_grids, 256);
+  auto stream = dev_ctx.stream();
+#endif
+  const int64_t main_offset = Floor(numel, num_per_block);
+  // 1.2 alloc tmp data for CoutBlock
+  const int size_count_block = need_grids + 1;
+  std::vector<int> dims_vec = {size_count_block * 2};
+  ScalarArray dims_array(dims_vec);
+  DenseTensor count_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
+  CT *count_data = count_mem.data<CT>();
+  // 1.3 launch CountKernl
+  GetBlockCountKernel<MT, CT, kVecSize><<<grid, block, 0, stream>>>(
+      cond_data, count_data, numel, main_offset);
+  // 2.1 alloc cumsum data for CoutBlock prefix
+  DenseTensor cumsum_mem = phi::Empty<CT, KPDevice>(dev_ctx, dims_array);
+  CT *cumsum_data = cumsum_mem.data<CT>();
+  // 2.2 get prefix of count_data for real out_index
+  const int kCumVesize = 2;
+  const int block_c = 256;
+  const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c));
+
+  using Add = kps::AddFunctor<CT>;
+  CumsumOneBlock<CT, CT, Add, kCumVesize><<<1, block_c, 0, stream>>>(
+      count_data, cumsum_data, size_count_block, main_offset_c, Add());
+  // 3.1 set temp ptr for in;
+  // 3.1 alloc for out
+  // 3.1.1 get true_num for gpu place the last cumsum is the true_num
+  paddle::memory::Copy(cpu_place,
+                       &total_true_num,
+                       cuda_place,
+                       cumsum_data + need_grids,
+                       t_size,
+                       dev_ctx.stream());
+
+  dev_ctx.Wait();
+  // 3.1.2 allock for out with total_true_num
+  std::vector<int64_t> out_dim = {static_cast<int64_t>(total_true_num)};
+
+  if (SelectData == 1) {
+    out->Resize(phi::make_ddim(out_dim));
+  } else if (SelectData == 0) {  // == 0 where_index
+    out_dim.push_back(rank);
+    out->Resize(phi::make_ddim(out_dim));
+  }
+  auto out_data = out->mutable_data<OutT>(cuda_place);
+  // 3.2 get true data's index according to cond_data and cumsum_data
+  if (total_true_num <= 0) return;
+  SelectKernel<MT,
+               InT,
+               CT,
+               OutT,
+               Functor,
+               kVecSize,
+               SelectData><<<grid, block, 0, stream>>>(out_data,
+                                                       cond_data,
+                                                       in_data_ptr,
+                                                       cumsum_data,
+                                                       func,
+                                                       numel,
+                                                       main_offset,
+                                                       rank);
+}
+
+}  // namespace funcs
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h
index 0a50dceb0a007..38b127541650b 100644
--- a/paddle/phi/kernels/funcs/slice.h
+++ b/paddle/phi/kernels/funcs/slice.h
@@ -123,5 +123,56 @@ DenseTensor Slice(const Context& dev_ctx,
   return ret;
 }
 
+// Use in conv_transpose kernel
+template <typename Context, typename T, size_t D>
+static void Slice(const Context& ctx,
+                  const DenseTensor* input,
+                  DenseTensor* out,
+                  const std::vector<int64_t>& begin_vec,
+                  const std::vector<int64_t>& end_vec,
+                  const std::vector<int64_t>& axes_vec) {
+  auto& place = *ctx.eigen_device();
+  auto in_dims = input->dims();
+  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
+  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
+  for (size_t i = 0; i < D; ++i) {
+    offsets[i] = 0;
+    extents[i] = in_dims[i];
+  }
+
+  std::vector<int64_t> out_shape_vec = vectorize(in_dims);
+  for (size_t i = 0; i < axes_vec.size(); ++i) {
+    offsets[axes_vec[i]] = begin_vec[i];
+    extents[axes_vec[i]] = end_vec[i] - begin_vec[i];
+    out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i];
+  }
+
+  DDim out_dims(make_ddim(out_shape_vec));
+  out->Resize(out_dims);
+  ctx.template Alloc<T>(out);
+
+  auto in_t =
+      EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
+  auto out_t = EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+      *out, out_dims);
+
+  funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
+      place, out_t, in_t, offsets, extents);
+  out->Resize(out_dims);
+}
+
+template <typename Context, typename T, size_t D>
+static void Slice(const Context& ctx,
+                  const DenseTensor* input,
+                  DenseTensor* out,
+                  int64_t begin_idx,
+                  int64_t end_idx,
+                  int64_t axes) {
+  std::vector<int64_t> begin_vec = {begin_idx};
+  std::vector<int64_t> end_vec = {end_idx};
+  std::vector<int64_t> axes_vec = {axes};
+  Slice<Context, T, D>(ctx, input, out, begin_vec, end_vec, axes_vec);
+}
+
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h
index 68fe8880a971d..19f1f3d3cd2fa 100644
--- a/paddle/phi/kernels/funcs/sparse/convolution.h
+++ b/paddle/phi/kernels/funcs/sparse/convolution.h
@@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint(
 }
 
 inline void GetOutShape(const DDim& x_dims,
-                        const DDim& kernel_dims,
+                        const std::vector<int>& kernel_sizes,
                         const std::vector<int>& paddings,
                         const std::vector<int>& dilations,
                         const std::vector<int>& strides,
@@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims,
       x_dims.size(),
       5,
       phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)"));
-  PADDLE_ENFORCE_EQ(kernel_dims.size(),
+  PADDLE_ENFORCE_EQ(kernel_sizes.size(),
                     5,
                     phi::errors::InvalidArgument(
                         "the shape of kernel should be (D, H, W, C, OC)"));
 
   // infer out shape
   (*out_dims)[0] = x_dims[0];
-  (*out_dims)[4] = kernel_dims[4];
+  (*out_dims)[4] = kernel_sizes[4];
   for (int i = 1; i < 4; i++) {
     (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] -
-                      dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) /
+                      dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) /
                          strides[i - 1] +
                      1;
   }
@@ -131,7 +131,7 @@ template <typename T, typename Context>
 inline void SubmPreProcess(const Context& dev_ctx,
                            const SparseCooTensor& x,
                            const DenseTensor& kernel,
-                           const SparseCooTensor& out_grad,
+                           const DenseTensor& out_grad,
                            const int in_channels,
                            const int out_channels,
                            const int half_kernel_size,
@@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx,
   blas.GEMM(CblasTrans,
             CblasNoTrans,
             x.non_zero_elements().dims()[1],
-            out_grad.non_zero_elements().dims()[1],
+            out_grad.dims()[1],
             x.non_zero_elements().dims()[0],
             static_cast<T>(1),
             x.non_zero_elements().data<T>(),
-            out_grad.non_zero_elements().data<T>(),
+            out_grad.data<T>(),
             static_cast<T>(0),
             d_kernel_ptr + half_kernel_size * in_channels * out_channels);
 
@@ -155,16 +155,36 @@ inline void SubmPreProcess(const Context& dev_ctx,
   T* x_grad_ptr = x_grad->data<T>();
   blas.GEMM(CblasNoTrans,
             CblasTrans,
-            out_grad.non_zero_elements().dims()[0],
+            out_grad.dims()[0],
             in_channels,
-            out_grad.non_zero_elements().dims()[1],
+            out_grad.dims()[1],
             static_cast<T>(1),
-            out_grad.non_zero_elements().data<T>(),
+            out_grad.data<T>(),
             kernel.data<T>() + half_kernel_size * in_channels * out_channels,
             static_cast<T>(0),
             x_grad_ptr);
 }
 
+inline const std::vector<int> PoolResetKernel(
+    const std::vector<int>& kernel_sizes,
+    const int in_channels,
+    const int out_channels) {
+  std::vector<int> res(kernel_sizes);
+  res.resize(5);
+  res[3] = in_channels;
+  res[4] = out_channels;
+  return res;
+}
+
+inline void PrefixSum(const int* counter, int* offsets, const int n) {
+  int offset = 0;
+  for (int i = 0; i < n; i++) {
+    offsets[i] = offset;
+    offset += counter[i];
+  }
+  offsets[n] = offset;
+}
+
 }  // namespace sparse
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/tril_triu_compute.h b/paddle/phi/kernels/funcs/tril_triu_compute.h
new file mode 100644
index 0000000000000..d2b6f1e559d2b
--- /dev/null
+++ b/paddle/phi/kernels/funcs/tril_triu_compute.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace phi {
+namespace funcs {
+
+template <typename T>
+class TrilTriuCompute {
+ public:
+  HOSTDEVICE TrilTriuCompute(const T* in,
+                             const int diagonal,
+                             const bool lower,
+                             const int64_t H,
+                             const int64_t W,
+                             T* out)
+      : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {}
+
+  HOSTDEVICE void operator()(int64_t idx) {
+    const int64_t row = (idx / W_) % H_;
+    const int64_t col = idx % W_;
+    const bool mask =
+        lower_ ? (col - row > diagonal_) : (col - row < diagonal_);
+    out_[idx] = mask ? static_cast<T>(0) : in_[idx];
+  }
+
+ private:
+  const T* in_;
+  const int diagonal_;
+  const bool lower_;
+  const int64_t H_;
+  const int64_t W_;
+  T* out_;
+};
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h
index 7b8a81471ef76..2d77c809bf9c9 100644
--- a/paddle/phi/kernels/funcs/unsqueeze.h
+++ b/paddle/phi/kernels/funcs/unsqueeze.h
@@ -21,6 +21,118 @@
 
 namespace phi {
 namespace funcs {
+inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims,
+                                  const DDim& in_dims,
+                                  bool is_runtime) {
+  size_t num_squeeze_dims = squeeze_dims.size();
+  std::vector<bool> should_squeeze(in_dims.size(), false);
+
+  // Mark dimensions need to be squeezed.
+  if (num_squeeze_dims == 0) {
+    for (int i = 0; i < in_dims.size(); ++i) {
+      if (in_dims[i] == 1) {
+        should_squeeze[i] = true;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < num_squeeze_dims; ++i) {
+      int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size()
+                                        : squeeze_dims[i];
+
+      PADDLE_ENFORCE_GE(
+          current,
+          0,
+          phi::errors::InvalidArgument(
+              "Each axis in Attr(axes) should be in the range of [%d, %d]"
+              "But current axis is:%d, input tensor's shape = [%s].",
+              -in_dims.size(),
+              in_dims.size() - 1,
+              current,
+              in_dims));
+      PADDLE_ENFORCE_LT(
+          current,
+          in_dims.size(),
+          phi::errors::InvalidArgument(
+              "Each axis in Attr(axes) should be in the range of [%d, %d]"
+              "But current axis is:%d, input tensor's shape = [%s].",
+              -in_dims.size(),
+              in_dims.size() - 1,
+              current,
+              in_dims));
+
+      if (!should_squeeze[current]) {
+        if (is_runtime) {
+          // At run time, dim of 1 is allowed to squeeze
+          if (in_dims[current] == 1) {
+            should_squeeze[current] = true;
+          }
+        } else {
+          // At compile time, dim of -1 or 1 is allowed to squeeze
+          if (in_dims[current] == 1 || in_dims[current] == -1) {
+            should_squeeze[current] = true;
+          }
+        }
+      }
+    }
+  }
+  // Make output dimensions
+  std::vector<int64_t> output_shape;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    if (!should_squeeze[i]) {
+      output_shape.push_back(in_dims[i]);
+    }
+  }
+  return phi::make_ddim(output_shape);
+}
+
+inline DDim GetUnsqueezeShape(const std::vector<int> unsqz_dims,
+                              const DDim& in_dims) {
+  int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
+  int cur_output_size = in_dims.size();
+  std::vector<int64_t> output_shape(output_size, 0);
+
+  // Validity Check: rank range.
+  PADDLE_ENFORCE_LE(
+      output_size,
+      6,
+      phi::errors::InvalidArgument("The output "
+                                   "tensor's rank should be less than 6."));
+
+  for (int axis : unsqz_dims) {
+    int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
+    // Vaildity Check: the axis bound
+    PADDLE_ENFORCE_GE(
+        cur,
+        0,
+        phi::errors::InvalidArgument("The insert dimension value should "
+                                     "not be less than 0"));
+    PADDLE_ENFORCE_LE(cur,
+                      cur_output_size,
+                      phi::errors::InvalidArgument(
+                          "The insert dimension value shoule not be larger "
+                          "than the dimension size of input tensor"));
+    // Move old axis, and insert new axis
+    for (int i = cur_output_size; i >= cur; --i) {
+      if (output_shape[i] == 1) {
+        // Move axis
+        output_shape[i + 1] = 1;
+        output_shape[i] = 0;
+      }
+    }
+    output_shape[cur] = 1;
+    // Add the output size.
+    cur_output_size++;
+  }
+
+  // Make output shape
+  for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
+    if (output_shape[out_idx] == 0) {
+      output_shape[out_idx] = in_dims[in_idx++];
+    }
+  }
+
+  return phi::make_ddim(output_shape);
+}
 
 inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
   // don't copy data, only change the dims
diff --git a/paddle/phi/kernels/gather_grad_kernel.h b/paddle/phi/kernels/gather_grad_kernel.h
new file mode 100644
index 0000000000000..e53da7b471c7b
--- /dev/null
+++ b/paddle/phi/kernels/gather_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& index,
+                      const DenseTensor& out_grad,
+                      const Scalar& axis,
+                      bool overwrite,
+                      DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gather_kernel.h b/paddle/phi/kernels/gather_kernel.h
new file mode 100644
index 0000000000000..78ac09125b692
--- /dev/null
+++ b/paddle/phi/kernels/gather_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& index,
+                  const Scalar& axis,
+                  DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gelu_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h
new file mode 100644
index 0000000000000..fd70e8d54bc8d
--- /dev/null
+++ b/paddle/phi/kernels/gelu_grad_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES  // use M_2_SQRTPI on Windows
+#endif
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GeluGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    bool approximate,
+                    DenseTensor* x_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/gelu_kernel.h b/paddle/phi/kernels/gelu_kernel.h
new file mode 100644
index 0000000000000..bc106a04031fb
--- /dev/null
+++ b/paddle/phi/kernels/gelu_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES  // use M_2_SQRTPI on Windows
+#endif
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+#define GELU_CONSTANT 0.044715
+
+template <typename T, typename Context>
+void GeluKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                bool approximate,
+                DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index c2995c79a7e8c..3cc41555a898b 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -73,119 +73,162 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
   }
 }
 
-#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \
+#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \
   template <typename T, typename Context>                           \
   void name##GradKernel(const Context& dev_ctx,                     \
                         const DenseTensor& x,                       \
                         const DenseTensor& dout,                    \
                         DenseTensor* dx) {                          \
-    functor_class functor;                                          \
-    ActivationGradGPUImpl<T, Context, functor_class>(               \
+    funcs::functor_class<T> functor;                                \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(     \
         dev_ctx, &x, nullptr, &dout, dx, functor);                  \
   }
 
-#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(         \
+    name, functor_class, attr)                                  \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& x,                   \
+                        const DenseTensor& dout,                \
+                        float attr,                             \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);              \
+  }
+
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(         \
+    name, functor_class, attr1, attr2)                          \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& x,                   \
+                        const DenseTensor& dout,                \
+                        float attr1,                            \
+                        float attr2,                            \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr1;                                 \
+    *(attrs[1].second) = attr2;                                 \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, &x, nullptr, &dout, dx, functor);              \
+  }
+
+#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \
   template <typename T, typename Context>                             \
   void name##GradKernel(const Context& dev_ctx,                       \
                         const DenseTensor& out,                       \
                         const DenseTensor& dout,                      \
                         DenseTensor* dx) {                            \
-    functor_class functor;                                            \
-    ActivationGradGPUImpl<T, Context, functor_class>(                 \
+    funcs::functor_class<T> functor;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(       \
         dev_ctx, nullptr, &out, &dout, dx, functor);                  \
   }
 
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor<T>);
-DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor<T>);
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(       \
+    name, functor_class, attr)                                  \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& out,                 \
+                        const DenseTensor& dout,                \
+                        float attr,                             \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr;                                  \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, nullptr, &out, &dout, dx, functor);            \
+  }
+
+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(       \
+    name, functor_class, attr1, attr2)                          \
+  template <typename T, typename Context>                       \
+  void name##GradKernel(const Context& dev_ctx,                 \
+                        const DenseTensor& out,                 \
+                        const DenseTensor& dout,                \
+                        float attr1,                            \
+                        float attr2,                            \
+                        DenseTensor* dx) {                      \
+    funcs::functor_class<T> functor;                            \
+    auto attrs = functor.GetAttrs();                            \
+    *(attrs[0].second) = attr1;                                 \
+    *(attrs[1].second) = attr2;                                 \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, nullptr, &out, &dout, dx, functor);            \
+  }
+
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor);
+
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
+
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
+                                               CudaLeakyReluGradFunctor,
+                                               alpha);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
+                                               CudaThresholdedReluGradFunctor,
+                                               threshold);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
+                                               CudaSoftShrinkGradFunctor,
+                                               lambda);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
+                                               CudaHardShrinkGradFunctor,
+                                               threshold);
+
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
+                                               CudaBReluGradFunctor,
+                                               t_min,
+                                               t_max);
+
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                 CudaHardSigmoidGradFunctor,
+                                                 slope,
+                                                 offset);
+
+template <typename T, typename Context>
+void EluGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& out,
+                   const DenseTensor& dout,
+                   float alpha,
+                   DenseTensor* dx) {
+  dev_ctx.template Alloc<T>(dx);
+  std::vector<const DenseTensor*> ins = {&dout, &out};
+  std::vector<DenseTensor*> outs = {dx};
+  if (alpha > 0) {
+    funcs::CudaELUGradFunctor<T> functor;
+    functor.alpha = alpha;
+    funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+  } else {
+    funcs::CudaELUGradNegativeAlphaFunctor<T> functor;
+    functor.alpha = alpha;
+    ins.push_back(&x);
+    funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+  }
+}
 
 }  // namespace phi
-PD_REGISTER_KERNEL(cos_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::CosGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(tan_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::TanGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(acos_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AcosGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(sin_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SinGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(asin_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AsinGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(atan_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AtanGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(sinh_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SinhGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(cosh_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::CoshGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(asinh_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AsinhGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(acosh_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AcoshGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(atanh_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AtanhGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
+
 #ifdef PADDLE_WITH_HIP
 PD_REGISTER_KERNEL(relu_grad,
                    GPU,
@@ -219,3 +262,56 @@ PD_REGISTER_KERNEL(relu_double_grad,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
 #endif
+
+#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                             \
+                     GPU,                              \
+                     ALL_LAYOUT,                       \
+                     phi::func,                        \
+                     float,                            \
+                     double,                           \
+                     phi::dtype::float16,              \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad,
+                                   LeakyReluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
+                                   ThresholdedReluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
+PD_REGISTER_KERNEL(log_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogDoubleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 26752b89e7c34..fb4e2e07b21cb 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/impl/activation_grad_impl.h"
+#include "paddle/phi/kernels/impl/activation_impl.h"
 
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 
@@ -38,26 +38,81 @@ void ActivationGPUImpl(const Context& dev_ctx,
   funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
 }
 
-#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class)                   \
-  template <typename T, typename Context>                                   \
-  void name##Kernel(                                                        \
-      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {     \
-    functor_class functor;                                                  \
-    ActivationGPUImpl<T, Context, functor_class>(dev_ctx, x, out, functor); \
+#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class)               \
+  template <typename T, typename Context>                               \
+  void name##Kernel(                                                    \
+      const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \
+    funcs::functor_class<T> functor;                                    \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>(             \
+        dev_ctx, x, out, functor);                                      \
   }
 
-DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor<T>)
-DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor<T>)
+#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \
+  template <typename T, typename Context>                               \
+  void name##Kernel(const Context& dev_ctx,                             \
+                    const DenseTensor& x,                               \
+                    float attr,                                         \
+                    DenseTensor* out) {                                 \
+    funcs::functor_class<T> functor;                                    \
+    auto attrs = functor.GetAttrs();                                    \
+    *(attrs[0].second) = attr;                                          \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>(             \
+        dev_ctx, x, out, functor);                                      \
+  }
+
+#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(               \
+    name, functor_class, attr1, attr2)                      \
+  template <typename T, typename Context>                   \
+  void name##Kernel(const Context& dev_ctx,                 \
+                    const DenseTensor& x,                   \
+                    float attr1,                            \
+                    float attr2,                            \
+                    DenseTensor* out) {                     \
+    funcs::functor_class<T> functor;                        \
+    auto attrs = functor.GetAttrs();                        \
+    *(attrs[0].second) = attr1;                             \
+    *(attrs[1].second) = attr2;                             \
+    ActivationGPUImpl<T, Context, funcs::functor_class<T>>( \
+        dev_ctx, x, out, functor);                          \
+  }
+
+DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log2, CudaLog2Functor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log10, CudaLog10Functor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log1p, CudaLog1pFunctor)
+
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
+                                     CudaThresholdedReluFunctor,
+                                     threshold)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink,
+                                     CudaHardShrinkFunctor,
+                                     threshold)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
+
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     CudaHardSigmoidFunctor,
+                                     slope,
+                                     offset)
 
 }  // namespace phi
 
@@ -79,65 +134,41 @@ PD_REGISTER_KERNEL(relu,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {}
 #endif
-PD_REGISTER_KERNEL(
-    sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {}
-PD_REGISTER_KERNEL(
-    cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {}
-PD_REGISTER_KERNEL(
-    tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {}
-PD_REGISTER_KERNEL(acos,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AcosKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(asin,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AsinKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(atan,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AtanKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(sinh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SinhKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(cosh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::CoshKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(asinh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AsinhKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(acosh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AcoshKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(atanh,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AtanhKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
+
+#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \
+  PD_REGISTER_KERNEL(name,                        \
+                     GPU,                         \
+                     ALL_LAYOUT,                  \
+                     phi::func,                   \
+                     float,                       \
+                     double,                      \
+                     phi::dtype::float16,         \
+                     phi::dtype::bfloat16) {}
+
+PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel)
+PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
+PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index 2c9ee5ede0103..339c3536d7a7f 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -359,8 +359,8 @@ void BatchNormGradRawKernel(const Context &ctx,
   }
 
   if (d_scale && d_bias) {
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    ctx.template Alloc<BatchNormParamType<T>>(d_scale);
+    ctx.template Alloc<BatchNormParamType<T>>(d_bias);
   }
 
   PADDLE_ENFORCE_EQ(
@@ -569,8 +569,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                   /*activationDesc=*/nullptr,
                   /*sizeInBytes=*/&workspace_size));
 
-      workspace_ptr = workspace_tensor.mutable_data(
-          ctx.GetPlace(), transformed_x.type(), workspace_size);
+      workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
+      workspace_ptr = ctx.template Alloc<T>(&workspace_tensor);
 
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cudnnBatchNormalizationBackwardEx(
@@ -594,12 +594,9 @@ void BatchNormGradRawKernel(const Context &ctx,
               /*dBnScaleBiasDesc=*/bn_param_desc_,
               /*bnScaleData=*/scale.template data<BatchNormParamType<T>>(),
               /*bnBiasData=*/nullptr,
-              /*dBnScaleData=*/d_scale
-                  ->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
-              /*dBnBiasData=*/d_bias
-                  ->template mutable_data<BatchNormParamType<T>>(
-                      ctx.GetPlace()),
+              /*dBnScaleData=*/ctx.template Alloc<BatchNormParamType<T>>(
+                  d_scale),
+              /*dBnBiasData=*/ctx.template Alloc<BatchNormParamType<T>>(d_bias),
               /*epsilon=*/epsilon,
               /*savedMean=*/saved_mean_data,
               /*savedInvVariance=*/saved_var_data,
@@ -626,10 +623,8 @@ void BatchNormGradRawKernel(const Context &ctx,
               H * W * D,
               epsilon,
               transformed_d_x.template data<T>(),
-              d_scale->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
-              d_bias->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()));
+              ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+              ctx.template Alloc<BatchNormParamType<T>>(d_bias));
         } else {
           BNBackward<T,
                      block,
@@ -644,10 +639,8 @@ void BatchNormGradRawKernel(const Context &ctx,
               H * W * D,
               epsilon,
               transformed_d_x.template data<T>(),
-              d_scale->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
-              d_bias->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()));
+              ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+              ctx.template Alloc<BatchNormParamType<T>>(d_bias));
         }
 
 // TODO(wangran16): wait for MIOpen to improve the performance of BN
@@ -682,10 +675,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                 ctx.template Alloc<T>(&transformed_d_x),
                 bn_param_desc_,
                 scale.template data<BatchNormParamType<T>>(),
-                d_scale->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                d_bias->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                ctx.template Alloc<BatchNormParamType<T>>(d_scale),
+                ctx.template Alloc<BatchNormParamType<T>>(d_bias),
                 epsilon,
                 saved_mean_data,
                 saved_var_data));
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index 49b550f51e60e..74a523f4ecf94 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -439,11 +439,11 @@ void BatchNormKernel(const Context &ctx,
     // Run training mode.
     // obtain running mean and running inv var, and there is no need
     // to initialize them.
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    ctx.template Alloc<BatchNormParamType<T>>(mean_out);
+    ctx.template Alloc<BatchNormParamType<T>>(variance_out);
 
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+    ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+    ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
 
     if ((N * H * W * D) == 1) {
       // Only 1 element in normalization dimension,
@@ -497,10 +497,10 @@ void BatchNormKernel(const Context &ctx,
                   /*xDesc=*/data_desc_,
                   /*sizeInBytes=*/&reserve_space_size));
 
-      reserve_space_ptr = reserve_space->mutable_data(
-          ctx.GetPlace(), transformed_x.type(), reserve_space_size);
-      workspace_ptr = workspace_tensor.mutable_data(
-          ctx.GetPlace(), transformed_x.type(), workspace_size);
+      reserve_space->Resize({static_cast<int64_t>(reserve_space_size)});
+      reserve_space_ptr = ctx.template Alloc<T>(reserve_space);
+      workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
+      workspace_ptr = ctx.template Alloc<T>(&workspace_tensor);
       PADDLE_ENFORCE_GPU_SUCCESS(
           paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
               handle,
@@ -518,15 +518,11 @@ void BatchNormKernel(const Context &ctx,
               scale.template data<BatchNormParamType<T>>(),
               bias.template data<BatchNormParamType<T>>(),
               this_factor,
-              mean_out->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
-              variance_out->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
+              ctx.template Alloc<BatchNormParamType<T>>(mean_out),
+              ctx.template Alloc<BatchNormParamType<T>>(variance_out),
               epsilon,
-              saved_mean->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
-              saved_variance->template mutable_data<BatchNormParamType<T>>(
-                  ctx.GetPlace()),
+              ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
+              ctx.template Alloc<BatchNormParamType<T>>(saved_variance),
               nullptr,
               workspace_ptr,
               workspace_size,
@@ -621,15 +617,11 @@ void BatchNormKernel(const Context &ctx,
                 scale.template data<BatchNormParamType<T>>(),
                 bias.template data<BatchNormParamType<T>>(),
                 this_factor,
-                mean_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                variance_out->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
+                ctx.template Alloc<BatchNormParamType<T>>(mean_out),
+                ctx.template Alloc<BatchNormParamType<T>>(variance_out),
                 epsilon,
-                saved_mean->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace()),
-                saved_variance->template mutable_data<BatchNormParamType<T>>(
-                    ctx.GetPlace())));
+                ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
+                ctx.template Alloc<BatchNormParamType<T>>(saved_variance)));
 #endif
       }
     }
diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
new file mode 100644
index 0000000000000..e583e13650aeb
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
@@ -0,0 +1,157 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+#include "paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h"
+
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Conv2dTransposeDoubleGradKernel(const Context& ctx,
+                                     const DenseTensor& x,
+                                     const DenseTensor& filter,
+                                     const DenseTensor& dout,
+                                     const DenseTensor& ddx,
+                                     const DenseTensor& ddfilter,
+                                     const std::vector<int>& strides,
+                                     const std::vector<int>& paddings,
+                                     const std::vector<int>& output_padding,
+                                     const std::vector<int>& output_size,
+                                     const std::string& padding_algorithm,
+                                     int groups,
+                                     const std::vector<int>& dilations,
+                                     const std::string& data_format,
+                                     DenseTensor* dx,
+                                     DenseTensor* dfilter,
+                                     DenseTensor* ddout) {
+  ConvTransposeGradRawKernel<T, Context>(ctx,
+                                         x,
+                                         filter,
+                                         dout,
+                                         strides,
+                                         paddings,
+                                         padding_algorithm,
+                                         groups,
+                                         dilations,
+                                         data_format,
+                                         dx,
+                                         dfilter);
+}
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeGradKernel(const Context& ctx,
+                                        const DenseTensor& x,
+                                        const DenseTensor& filter,
+                                        const DenseTensor& dout,
+                                        const std::vector<int>& strides,
+                                        const std::vector<int>& paddings,
+                                        const std::vector<int>& output_padding,
+                                        const std::vector<int>& output_size,
+                                        const std::string& padding_algorithm,
+                                        int groups,
+                                        const std::vector<int>& dilations,
+                                        const std::string& data_format,
+                                        DenseTensor* dx,
+                                        DenseTensor* dfilter) {
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  DenseTensor filter_ = filter;
+
+  if (!dx && !dfilter) {
+    return;
+  }
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  auto x_dims = x.dims();
+  auto filter_dims = filter_.dims();
+
+  DDim in_data_dims;
+  if (data_layout != DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  } else {
+    in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  if (dx) {
+    paddle::operators::math::DepthwiseConvFunctor<Context, T> depthwiseConv;
+    depthwiseConv(ctx,
+                  dout,
+                  filter_,
+                  strides,
+                  std::vector<int>{
+                      paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
+                  dilations_,
+                  dx,
+                  data_layout);
+  }
+
+  if (dfilter) {
+    funcs::SetConstant<Context, T> set_zero;
+    ctx.template Alloc<T>(dfilter);
+    set_zero(ctx, dfilter, static_cast<T>(0));
+
+    paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T>
+        depthwiseConvFilterGrad;
+    depthwiseConvFilterGrad(
+        ctx,
+        dout,
+        x,
+        strides,
+        std::vector<int>{
+            paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
+        dilations_,
+        dfilter,
+        data_layout);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeDoubleGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(conv3d_transpose_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGradKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConv2dTransposeGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
new file mode 100644
index 0000000000000..b7d34a5baf3df
--- /dev/null
+++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
@@ -0,0 +1,118 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+#include "paddle/phi/kernels/impl/conv_transpose_kernel_impl.h"
+
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gpu/depthwise_conv.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DepthwiseConv2dTransposeKernel(const Context& ctx,
+                                    const DenseTensor& x,
+                                    const DenseTensor& filter,
+                                    const std::vector<int>& strides,
+                                    const std::vector<int>& paddings,
+                                    const std::vector<int>& output_padding,
+                                    const std::vector<int>& output_size,
+                                    const std::string& padding_algorithm,
+                                    int groups,
+                                    const std::vector<int>& dilations,
+                                    const std::string& data_format,
+                                    DenseTensor* out) {
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  DenseTensor filter_ = filter;
+  ctx.template Alloc<T>(out);
+
+  PADDLE_ENFORCE_EQ(
+      groups,
+      filter_.dims()[0],
+      errors::InvalidArgument(
+          "groups should be error to the 1st dimension of filter_. But "
+          "received groups is %d and filter dimension[0] is %d",
+          groups,
+          filter_.dims()[0]));
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  for (auto v : dilations_) {
+    PADDLE_ENFORCE_EQ(
+        v,
+        1,
+        errors::InvalidArgument("dilations should be 1 in depthwise conv. "
+                                "But received dilations is %d",
+                                v));
+  }
+
+  auto x_dims = x.dims();
+  auto filter_dims = filter_.dims();
+
+  DDim in_data_dims;
+  if (data_layout != DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  } else {
+    in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  ctx.template Alloc<T>(out);
+
+  funcs::SetConstant<Context, T> set_zero;
+  set_zero(ctx, out, static_cast<T>(0));
+
+  paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T>
+      depthwiseConvInputGrad;
+  depthwiseConvInputGrad(
+      ctx,
+      *out,
+      filter,
+      x,
+      strides,
+      std::vector<int>{paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
+      dilations_,
+      out,
+      data_layout);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(conv3d_transpose,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeKernel,
+                   float,
+                   double) {}
+PD_REGISTER_KERNEL(depthwise_conv2d_transpose,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DepthwiseConv2dTransposeKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu
index 4545f9ce436ea..a16c8369cc9e5 100644
--- a/paddle/phi/kernels/gpu/copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/copy_kernel.cu
@@ -87,7 +87,8 @@ void Copy(const Context& dev_ctx,
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     paddle::memory::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+  } else if ((paddle::platform::is_cpu_place(src_place) ||
+              paddle::platform::is_cuda_pinned_place(src_place)) &&  // NOLINT
              paddle::platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = src_place;
     auto dst_gpu_place = dst_place;
diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
new file mode 100644
index 0000000000000..6e8712462928d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -0,0 +1,320 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_grad_kernel.h"
+
+#include <thrust/transform.h>
+#include "paddle/fluid/operators/math/inclusive_scan.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+// NOTE(@xiongkun): use of IsComplex<>
+#include "paddle/fluid/framework/data_type.h"
+
+namespace phi {
+
+template <typename T>
+struct CumprodGradFunctorExceptFirstZero {
+  HOSTDEVICE CumprodGradFunctorExceptFirstZero(
+      const T *x,
+      const T *y,
+      const T *dy_mul_y_reversed_cumsum,
+      const uint8_t *zero_mask,
+      size_t mid_dim,
+      size_t inner_dim,
+      T *dx,
+      int64_t *first_zero_idx,
+      T *x_filled_one)
+      : x_(x),
+        y_(y),
+        dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum),
+        zero_mask_(zero_mask),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        dx_(dx),
+        first_zero_idx_(first_zero_idx),
+        x_filled_one_(x_filled_one) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto inner_idx = idx % inner_dim_;
+    auto outer_idx = idx / (mid_dim_ * inner_dim_);
+    auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_;
+    auto mask = zero_mask_[idx];
+    bool should_fill_one = true;
+
+    if (mask == 0) {
+      dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx];
+      if (mid_idx == mid_dim_ - 1) {
+        // record first zero position as -1, i.e., no zero
+        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1;
+      }
+    } else if (mid_idx > 0) {                  // mask > 0
+      if (zero_mask_[idx - inner_dim_] > 0) {  // not first zero
+        dx_[idx] = 0;
+        should_fill_one = false;
+      } else {
+        // idx is the first zero position, it should be recorded
+        dx_[idx] = y_[idx - inner_dim_];
+        first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx;
+      }
+    } else {  // the first zero position is index 0
+      dx_[idx] = 1;
+      first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0;
+    }
+
+    x_filled_one_[idx] = should_fill_one ? 1 : x_[idx];
+  }
+
+ private:
+  const T *x_;
+  const T *y_;
+  const T *dy_mul_y_reversed_cumsum_;
+  const uint8_t *zero_mask_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T *dx_;
+  int64_t *first_zero_idx_;
+  T *x_filled_one_;
+};
+
+template <typename T>
+struct FillFirstZeroPositionGradFunctor {
+  HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx,
+                                              const T *grad_value,
+                                              size_t mid_dim,
+                                              size_t inner_dim,
+                                              T *dx)
+      : first_zero_idx_(first_zero_idx),
+        grad_value_(grad_value),
+        mid_dim_(mid_dim),
+        inner_dim_(inner_dim),
+        dx_(dx) {}
+
+  HOSTDEVICE void operator()(size_t idx) const {
+    auto outer_idx = idx / inner_dim_;
+    auto inner_idx = idx % inner_dim_;
+    auto mid_idx = first_zero_idx_[idx];
+    if (mid_idx >= 0) {
+      auto full_idx =
+          outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx;
+      dx_[full_idx] *= grad_value_[full_idx];
+    }
+  }
+
+ private:
+  const int64_t *first_zero_idx_;
+  const T *grad_value_;
+  size_t mid_dim_;
+  size_t inner_dim_;
+  T *dx_;
+};
+
+template <typename T, typename Context>
+void CumprodGradKernel(const Context &dev_ctx,
+                       const DenseTensor &x,
+                       const DenseTensor &out,
+                       const DenseTensor &dout,
+                       int dim,
+                       DenseTensor *dx) {
+  const auto *y = &out;
+  const auto *dy = &dout;
+
+  size_t outer_dim, mid_dim, inner_dim;
+  GetCumprodDimInfo(x.dims(), dim, &outer_dim, &mid_dim, &inner_dim);
+  if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
+
+  size_t numel = outer_dim * mid_dim * inner_dim;
+
+  const auto *x_data = x.data<T>();
+  const auto *y_data = y->data<T>();
+  const auto *dy_data = dy->data<T>();
+
+  auto place = dev_ctx.GetPlace();
+  auto *dx_data = dev_ctx.template Alloc<T>(dx);
+
+  // deal with complex
+  const T *x_data_deal;
+  const T *y_data_deal;
+  Allocator::AllocationPtr x_conj;
+  Allocator::AllocationPtr y_conj;
+  if (paddle::framework::IsComplex<T>::value) {
+    x_conj = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto *x_data_conj = reinterpret_cast<T *>(x_conj->ptr());
+    y_conj = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                 .Allocate(numel * sizeof(T));
+    auto *y_data_conj = reinterpret_cast<T *>(y_conj->ptr());
+
+    phi::funcs::ForRange<Context> for_range_x(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_x(x_data, numel, x_data_conj);
+    for_range_x(functor_x);
+
+    phi::funcs::ForRange<Context> for_range_y(dev_ctx, numel);
+    phi::funcs::ConjFunctor<T> functor_y(y_data, numel, y_data_conj);
+    for_range_y(functor_y);
+    x_data_deal = x_data_conj;
+    y_data_deal = y_data_conj;
+  } else {
+    x_data_deal = x_data;
+    y_data_deal = y_data;
+  }
+
+// Step 1: find cummax-ed zero mask of x
+#ifdef PADDLE_WITH_CUDA
+  const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream());
+#else
+  const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream());
+#endif
+  auto zero_mask_without_cummax =
+      const_cast<Allocator &>(dev_ctx.GetAllocator())
+          .Allocate(numel * sizeof(uint8_t));
+  auto *zero_mask_without_cummax_data =
+      reinterpret_cast<uint8_t *>(zero_mask_without_cummax->ptr());
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(x_data_deal),
+                    thrust::device_pointer_cast(x_data_deal) + numel,
+                    thrust::device_pointer_cast(zero_mask_without_cummax_data),
+                    funcs::IsZeroFunctor<T>());
+
+  auto zero_mask = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                       .Allocate(numel * sizeof(uint8_t));
+  auto *zero_mask_data = reinterpret_cast<uint8_t *>(zero_mask->ptr());
+  paddle::operators::math::InclusiveScan<uint8_t, cub::Max>(
+      zero_mask_without_cummax_data,
+      zero_mask_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<uint8_t>(0),
+      cub::Max(),
+      /*reverse=*/false,
+      dev_ctx);
+  zero_mask_without_cummax = nullptr;
+
+  // Step 2: calculate reversed cumsum(dy * y)
+  auto dy_mul_y = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                      .Allocate(numel * sizeof(T));
+  auto *dy_mul_y_data = reinterpret_cast<T *>(dy_mul_y->ptr());
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(dy_data),
+                    thrust::device_pointer_cast(dy_data) + numel,
+                    thrust::device_pointer_cast(y_data_deal),
+                    thrust::device_pointer_cast(dy_mul_y_data),
+                    funcs::MultiplyFunctor<T>());
+
+  auto dy_mul_y_reversed_cumsum =
+      const_cast<Allocator &>(dev_ctx.GetAllocator())
+          .Allocate(numel * sizeof(T));
+  auto *dy_mul_y_reversed_cumsum_data =
+      reinterpret_cast<T *>(dy_mul_y_reversed_cumsum->ptr());
+  paddle::operators::math::InclusiveScan<T, cub::Sum>(
+      dy_mul_y_data,
+      dy_mul_y_reversed_cumsum_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(0),
+      cub::Sum(),
+      /*reverse=*/true,
+      dev_ctx);
+
+  // Step 3: calculate the gradient value except the first zero position.
+  // The gradient value of the first zero position is filled with out[idx-1],
+  // while the gradient value of the other positions are calculated out
+  // completely. This functor also:
+  //  (1) find the first zero index, i.e., first_zero_idx_data.
+  //  (2) fill x_filled_one, which satifies
+  //      x_filled_one[i] = x[i], i > pos
+  //      x_filled_one[i] = 1, i <= pos
+  auto first_zero_idx = const_cast<Allocator &>(dev_ctx.GetAllocator())
+                            .Allocate(numel * sizeof(int64_t));
+  auto *first_zero_idx_data =
+      reinterpret_cast<int64_t *>(first_zero_idx->ptr());
+  auto *x_filled_one_data = dy_mul_y_data;  // reuse former allocated memory
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  CumprodGradFunctorExceptFirstZero<T> functor_except_first_zero(
+      x_data_deal,
+      y_data_deal,
+      dy_mul_y_reversed_cumsum_data,
+      zero_mask_data,
+      mid_dim,
+      inner_dim,
+      dx_data,
+      first_zero_idx_data,
+      x_filled_one_data);
+  for_range(functor_except_first_zero);
+
+  // Step 4: calculate cumprod of x_filled_one
+  auto *x_filled_one_cumprod_data =
+      dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
+  paddle::operators::math::InclusiveScan<T, funcs::MultiplyFunctor<T>>(
+      x_filled_one_data,
+      x_filled_one_cumprod_data,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(1),
+      funcs::MultiplyFunctor<T>(),
+      /*reverse=*/false,
+      dev_ctx);
+
+  // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod)
+  auto *dy_mul_x_filled_one_cumprod =
+      dy_mul_y_data;  // reuse former allocated memory
+  thrust::transform(exec_policy,
+                    thrust::device_pointer_cast(dy_data),
+                    thrust::device_pointer_cast(dy_data) + numel,
+                    thrust::device_pointer_cast(x_filled_one_cumprod_data),
+                    thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod),
+                    funcs::MultiplyFunctor<T>());
+  auto *dy_mul_x_filled_one_cumprod_reversed_cumsum =
+      dy_mul_y_reversed_cumsum_data;  // reuse former allocated memory
+  paddle::operators::math::InclusiveScan<T, cub::Sum>(
+      dy_mul_x_filled_one_cumprod,
+      dy_mul_x_filled_one_cumprod_reversed_cumsum,
+      outer_dim,
+      mid_dim,
+      inner_dim,
+      static_cast<T>(0),
+      cub::Sum(),
+      /*reverse=*/true,
+      dev_ctx);
+
+  // Step 6: fill zero pos gradient value
+  phi::funcs::ForRange<Context> for_range_fill_zero_pos_grad(
+      dev_ctx, outer_dim * inner_dim);
+  FillFirstZeroPositionGradFunctor<T> fill_first_zero_pos_grad_functor(
+      first_zero_idx_data,
+      dy_mul_x_filled_one_cumprod_reversed_cumsum,
+      mid_dim,
+      inner_dim,
+      dx_data);
+  for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumprodGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu b/paddle/phi/kernels/gpu/cumprod_kernel.cu
new file mode 100644
index 0000000000000..1bbf8972a2479
--- /dev/null
+++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/cumprod_kernel.h"
+
+#include "paddle/fluid/operators/math/inclusive_scan.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/cumprod.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CumprodKernel(const Context &dev_ctx,
+                   const DenseTensor &input,
+                   int dim,
+                   DenseTensor *out) {
+  const auto *x = &input;
+  auto *y = out;
+  size_t outer_dim, mid_dim, inner_dim;
+  GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim);
+
+  const auto *x_data = x->data<T>();
+  auto *y_data = dev_ctx.template Alloc<T>(y);
+  paddle::operators::math::InclusiveScan(x_data,
+                                         y_data,
+                                         outer_dim,
+                                         mid_dim,
+                                         inner_dim,
+                                         static_cast<T>(1),
+                                         funcs::MultiplyFunctor<T>(),
+                                         /*reverse=*/false,
+                                         dev_ctx);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cumprod,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CumprodKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
new file mode 100644
index 0000000000000..1db6e1b7cf733
--- /dev/null
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -0,0 +1,160 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/deformable_conv_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void ModulatedDeformableIm2colGpuKernel(
+    const int nthreads,
+    const T* data_im,
+    const T* data_offset,
+    const T* data_mask,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    T* data_col) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    T* data_col_ptr =
+        data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    const T* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const T* data_offset_ptr =
+        data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const T* data_mask_ptr =
+        data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        const T mask = data_mask_ptr[data_mask_hw_ptr];
+        T val = static_cast<T>(0);
+        const T h_im = h_in + i * dilation_h + offset_h;
+        const T w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          val =
+              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ModulatedDeformableIm2col(const Context& dev_ctx,
+                               const T* data_im,
+                               const T* data_offset,
+                               const T* data_mask,
+                               const std::vector<int64_t>& im_shape,
+                               const std::vector<int64_t>& col_shape,
+                               const std::vector<int64_t>& filter_shape,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& dilations,
+                               const int deformable_groups,
+                               T* data_col) {
+  int channel_per_deformable_group = im_shape[0] / deformable_groups;
+  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+
+  int blocks = NumBlocks(num_kernels);
+  int threads = kNumCUDAThreads;
+
+  ModulatedDeformableIm2colGpuKernel<
+      T><<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
+                                                   data_im,
+                                                   data_offset,
+                                                   data_mask,
+                                                   im_shape[1],
+                                                   im_shape[2],
+                                                   filter_shape[2],
+                                                   filter_shape[3],
+                                                   paddings[0],
+                                                   paddings[1],
+                                                   strides[0],
+                                                   strides[1],
+                                                   dilations[0],
+                                                   dilations[1],
+                                                   channel_per_deformable_group,
+                                                   col_shape[1],
+                                                   im_shape[0],
+                                                   deformable_groups,
+                                                   col_shape[2],
+                                                   col_shape[3],
+                                                   data_col);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(deformable_conv,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DeformableConvKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu
new file mode 100644
index 0000000000000..cce12a87fac72
--- /dev/null
+++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/determinant_grad_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(determinant_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DeterminantGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu
new file mode 100644
index 0000000000000..2518408387395
--- /dev/null
+++ b/paddle/phi/kernels/gpu/determinant_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/determinant_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/determinant_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu
index 2cffc68fa0648..a57d89013f921 100644
--- a/paddle/phi/kernels/gpu/elementwise_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu
@@ -13,9 +13,50 @@
 // limitations under the License.
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h"
 
+namespace phi {
+
+#define DEFINE_CUDA_ELEMENTWISE_OP(name)                             \
+  template <typename T, typename Context>                            \
+  void name##RawKernel(const Context& dev_ctx,                       \
+                       const DenseTensor& x,                         \
+                       const DenseTensor& y,                         \
+                       int axis,                                     \
+                       DenseTensor* out) {                           \
+    std::vector<const DenseTensor*> inputs;                          \
+    std::vector<DenseTensor*> outputs;                               \
+    inputs.emplace_back(&x);                                         \
+    inputs.emplace_back(&y);                                         \
+    outputs.emplace_back(out);                                       \
+    dev_ctx.template Alloc<T>(out);                                  \
+    funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(          \
+        dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
+  }
+
+/**
+ * Kernels
+ */
+
+// Create the definition of Add
+DEFINE_CUDA_ELEMENTWISE_OP(Add)
+// Create the definition of Subtract
+DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
+// Create the definition of Multiply
+DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
+// Create the definition of Divide
+DEFINE_CUDA_ELEMENTWISE_OP(Divide)
+
+}  // namespace phi
+
+using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
 PD_REGISTER_KERNEL(elementwise_fmax,
                    GPU,
                    ALL_LAYOUT,
@@ -33,3 +74,55 @@ PD_REGISTER_KERNEL(elementwise_fmin,
                    double,
                    int,
                    int64_t) {}
+
+PD_REGISTER_KERNEL(add_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AddRawKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float16,
+                   bfloat16,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(subtract_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SubtractRawKernel,
+                   float,
+                   double,
+                   int16_t,
+                   int,
+                   int64_t,
+                   float16,
+                   bfloat16,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(divide_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::DivideRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   float16,
+                   bfloat16,
+                   complex64,
+                   complex128) {}
+PD_REGISTER_KERNEL(multiply_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplyRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   float16,
+                   complex64,
+                   complex128,
+                   bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
new file mode 100644
index 0000000000000..a970348760c18
--- /dev/null
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -0,0 +1,258 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/embedding_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace phi {
+
+template <typename InT, typename OutT>
+__global__ void InputTypeConvert(const InT* in_ids,
+                                 const int64_t K,
+                                 OutT* out_ids) {
+  for (int i = 0; i < K; i++) {
+    out_ids[i] = static_cast<OutT>(in_ids[i]);
+  }
+}
+
+template <typename T, typename IdT>
+__global__ void EmbeddingGrad(T* table,
+                              const T* output,
+                              const IdT* ids,
+                              const int64_t N,
+                              const int64_t K,
+                              const int64_t D) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
+
+  while (idy < K) {
+    auto id = static_cast<int64_t>(ids[idy]);
+    const T* out = output + idy * D;
+    T* tab = table + id * D;
+#ifdef PADDLE_WITH_CUDA
+    paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
+#else
+    for (int i = idx; i < D; i += blockDim.x) {
+      paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+    }
+#endif
+    idy += blockDim.y * gridDim.x;
+  }
+}
+
+template <typename T, typename Context>
+struct EmbeddingGradCUDAFunctor {
+  EmbeddingGradCUDAFunctor(const Context& dev_ctx,
+                           const DenseTensor& input,
+                           const DenseTensor& weight,
+                           const DenseTensor& out_grad,
+                           int64_t padding_idx,
+                           DenseTensor* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        padding_idx_(padding_idx),
+        weight_grad_(weight_grad) {}
+
+  template <typename IdT>
+  void apply() {
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    {
+      auto d_output_t = out_grad_;
+      auto d_table_t = weight_grad_;
+
+      int N = weight_grad_->dims()[0];
+      int D = weight_grad_->dims()[1];
+      int K = input_.numel();
+
+      const T* d_output = d_output_t.template data<T>();
+      const auto* ids = input_.template data<IdT>();
+      T* d_table = dev_ctx_.template Alloc<T>(d_table_t);
+
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream()));
+#endif
+
+      const int gridx = 2 * dev_ctx_.GetSMCount();
+      dim3 threads(128, 8);
+      dim3 grids(gridx, 1);
+      EmbeddingGrad<T, IdT><<<grids, threads, 0, dev_ctx_.stream()>>>(
+          d_table, d_output, ids, N, K, D);
+    }
+  }
+
+ private:
+  const phi::GPUContext& dev_ctx_;
+  const DenseTensor& input_;
+  const DenseTensor& weight_;
+  const DenseTensor& out_grad_;
+  int64_t padding_idx_;
+  DenseTensor* weight_grad_;
+};
+
+template <typename T, typename Context>
+void EmbeddingGradKernel(const Context& ctx,
+                         const DenseTensor& input,
+                         const DenseTensor& weight,
+                         const DenseTensor& out_grad,
+                         int64_t padding_idx,
+                         DenseTensor* weight_grad) {
+  EmbeddingGradCUDAFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+template <typename T, typename Context>
+struct EmbeddingSparseGradCUDAFunctor {
+  EmbeddingSparseGradCUDAFunctor(const Context& dev_ctx,
+                                 const DenseTensor& input,
+                                 const DenseTensor& weight,
+                                 const DenseTensor& out_grad,
+                                 int64_t padding_idx,
+                                 SelectedRows* weight_grad)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_grad_(out_grad),
+        padding_idx_(padding_idx),
+        weight_grad_(weight_grad) {}
+
+  template <typename IdT>
+  void apply() {
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+
+    const auto* ids_data = input_.template data<IdT>();
+    auto* d_table = weight_grad_;
+    auto* table = &weight_;
+    auto* d_output = &out_grad_;
+    int64_t ids_num = input_.numel();
+    dim3 threads(128, 8);
+    dim3 grids(8, 1);
+    auto stream = dev_ctx_.stream();
+    paddle::framework::Vector<int64_t> new_rows;
+    new_rows.resize(ids_num);
+    auto gpu_place = dev_ctx_.GetPlace();
+
+    paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
+    if (!std::is_same<IdT, int64_t>::value) {
+      InputTypeConvert<<<grids, threads, 0, stream>>>(
+          ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
+    } else {
+      paddle::memory::Copy(gpu_place,
+                           mixv_new_rows.CUDAMutableData(gpu_place),
+                           gpu_place,
+                           ids_data,
+                           ids_num * sizeof(int64_t),
+                           stream);
+    }
+
+    mixv_new_rows.CopyToCPU();
+    d_table->set_rows(new_rows);
+
+    auto* d_table_value = d_table->mutable_value();
+    d_table_value->Resize({ids_num, table->dims()[1]});
+    dev_ctx_.template Alloc<T>(d_table_value);
+
+    auto* d_table_data = d_table_value->template data<T>();
+    auto* d_output_data = d_output->template data<T>();
+    auto d_output_dims = d_output->dims();
+    auto d_output_dims_2d =
+        phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
+    PADDLE_ENFORCE_EQ(d_table_value->dims(),
+                      d_output_dims_2d,
+                      phi::errors::InvalidArgument(
+                          "ShapeError: The shape of lookup_table@Grad and "
+                          "output@Grad should be same. "
+                          "But received lookup_table@Grad's shape = [%s], "
+                          "output@Grad's shape = [%s].",
+                          d_table_value->dims(),
+                          d_output_dims_2d));
+    paddle::memory::Copy(gpu_place,
+                         d_table_data,
+                         gpu_place,
+                         d_output_data,
+                         d_output->numel() * sizeof(T),
+                         stream);
+  }
+
+ private:
+  const phi::GPUContext& dev_ctx_;
+  const DenseTensor& input_;
+  const DenseTensor& weight_;
+  const DenseTensor& out_grad_;
+  int64_t padding_idx_;
+  SelectedRows* weight_grad_;
+};
+
+template <typename T, typename Context>
+void EmbeddingSparseGradKernel(const Context& ctx,
+                               const DenseTensor& input,
+                               const DenseTensor& weight,
+                               const DenseTensor& out_grad,
+                               int64_t padding_idx,
+                               SelectedRows* weight_grad) {
+  EmbeddingSparseGradCUDAFunctor<T, Context> functor(
+      ctx, input, weight, out_grad, padding_idx, weight_grad);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(embedding_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(embedding_sparse_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingSparseGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/embedding_kernel.cu b/paddle/phi/kernels/gpu/embedding_kernel.cu
new file mode 100644
index 0000000000000..7f3a31ba544d8
--- /dev/null
+++ b/paddle/phi/kernels/gpu/embedding_kernel.cu
@@ -0,0 +1,126 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/embedding_kernel.h"
+#include "paddle/phi/kernels/funcs/embedding_util.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename T, typename IdT, bool PaddingFlag>
+__global__ void EmbeddingFW(T *output,
+                            const T *table,
+                            const IdT *ids,
+                            const int64_t N,
+                            const int64_t K,
+                            const int64_t D,
+                            const int64_t padding_idx) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * gridDim.x;
+
+  while (idy < K) {
+    auto id = static_cast<int64_t>(ids[idy]);
+    T *out = output + idy * D;
+    const T *tab = table + id * D;
+    for (int i = idx; i < D; i += blockDim.x) {
+      if (PaddingFlag) {
+        if (id == padding_idx)
+          out[i] = static_cast<T>(0);
+        else
+          out[i] = tab[i];
+      } else {
+        out[i] = tab[i];
+      }
+    }
+    idy += blockDim.y * gridDim.x;
+  }
+}
+
+template <typename T, typename Context>
+struct EmbeddingCUDAFunctor {
+  EmbeddingCUDAFunctor(const Context &dev_ctx,
+                       const DenseTensor &input,
+                       const DenseTensor &weight,
+                       int64_t padding_idx,
+                       DenseTensor *out)
+      : dev_ctx_(dev_ctx),
+        input_(input),
+        weight_(weight),
+        out_(out),
+        padding_idx_(padding_idx) {}
+
+  template <typename IdT>
+  void apply() {
+    size_t N = weight_.dims()[0];
+    size_t D = weight_.dims()[1];
+    size_t K = input_.numel();
+
+    const int gridx = 2 * dev_ctx_.GetSMCount();
+    dim3 threads(256, 4);
+    dim3 grids(gridx, 1);
+
+    const T *table = weight_.template data<T>();
+    const IdT *ids = input_.template data<IdT>();
+    auto *output = dev_ctx_.template Alloc<T>(out_);
+    auto stream = dev_ctx_.stream();
+
+    if (padding_idx_ == -1) {
+      EmbeddingFW<T, IdT, false><<<grids, threads, 0, stream>>>(
+          output, table, ids, N, K, D, padding_idx_);
+    } else {
+      EmbeddingFW<T, IdT, true><<<grids, threads, 0, stream>>>(
+          output, table, ids, N, K, D, padding_idx_);
+    }
+  }
+
+ private:
+  const phi::GPUContext &dev_ctx_;
+  const DenseTensor &input_;
+  const DenseTensor &weight_;
+  DenseTensor *out_;
+  int64_t padding_idx_;
+};
+
+template <typename T, typename Context>
+void EmbeddingKernel(const Context &ctx,
+                     const DenseTensor &input,
+                     const DenseTensor &weight,
+                     int64_t padding_idx,
+                     DenseTensor *out) {
+  EmbeddingCUDAFunctor<T, Context> functor(
+      ctx, input, weight, padding_idx, out);
+
+  if (input.dtype() == phi::DataType::INT32) {
+    functor.template apply<int32_t>();
+  } else if (input.dtype() == phi::DataType::INT64) {
+    functor.template apply<int64_t>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "emebdding input only support int32 and int64"));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(embedding,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::EmbeddingKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu
new file mode 100644
index 0000000000000..221bf1cb4c68c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/frobenius_norm_grad_kernel.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(frobenius_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FrobeniusNormGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
new file mode 100644
index 0000000000000..012237165b739
--- /dev/null
+++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/frobenius_norm_kernel.h"
+#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu
new file mode 100644
index 0000000000000..04149a2f9ee41
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu
@@ -0,0 +1,73 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_kernel.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& index,
+                      const DenseTensor& out_grad,
+                      const Scalar& axis,
+                      bool overwrite,
+                      DenseTensor* x_grad) {
+  const auto& index_type = index.dtype();
+  auto axis_v = axis.to<int>();
+
+  if (axis_v != 0) {
+    if (index_type == DataType::INT32) {
+      phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(
+          &out_grad, &index, axis_v, x_grad, dev_ctx);
+    } else if (index_type == DataType::INT64) {
+      phi::funcs::GatherV2GradCUDAFunction<T, int64_t>(
+          &out_grad, &index, axis_v, x_grad, dev_ctx);
+    }
+    return;
+  }
+
+  dev_ctx.template Alloc<T>(x_grad);
+  auto dxt = EigenVector<T>::Flatten(*x_grad);
+  auto& place = *dev_ctx.eigen_device();
+  dxt.device(place) = dxt.constant(static_cast<T>(0));
+  if (out_grad.numel() == 0) return;
+  if (index_type == DataType::INT32) {
+    phi::funcs::GPUScatterAssign<T, int>(
+        dev_ctx, out_grad, index, x_grad, overwrite);
+  } else if (index_type == DataType::INT64) {
+    phi::funcs::GPUScatterAssign<T, int64_t>(
+        dev_ctx, out_grad, index, x_grad, overwrite);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GatherGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu
new file mode 100644
index 0000000000000..7e0c6cc168564
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gather_kernel.cu
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gather_kernel.h"
+
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GatherKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& index,
+                  const Scalar& axis,
+                  DenseTensor* out) {
+  const auto& index_type = index.dtype();
+  auto axis_v = axis.to<int>();
+  if (axis_v != 0) {
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GatherV2CUDAFunction<T, int32_t>(
+          &x, &index, axis_v, out, dev_ctx);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::GatherV2CUDAFunction<T, int64_t>(
+          &x, &index, axis_v, out, dev_ctx);
+    } else if (index_type == phi::DataType::INT16) {
+      phi::funcs::GatherV2CUDAFunction<T, int16_t>(
+          &x, &index, axis_v, out, dev_ctx);
+    }
+    return;
+  }
+
+  dev_ctx.template Alloc<T>(out);
+
+  if (x.numel() == 0) return;
+  if (index_type == phi::DataType::INT32) {
+    phi::funcs::GPUGather<T, int>(dev_ctx, x, index, out);
+  } else if (index_type == phi::DataType::INT64) {
+    phi::funcs::GPUGather<T, int64_t>(dev_ctx, x, index, out);
+  } else if (index_type == phi::DataType::INT16) {
+    phi::funcs::GPUGather<T, int16_t>(dev_ctx, x, index, out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gather,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GatherKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   int16_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h
new file mode 100644
index 0000000000000..2b9be7c615435
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gelu_funcs.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/flags.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+
+DECLARE_bool(use_fast_math);
+
+namespace phi {
+
+#ifdef __NVCC__
+template <bool FastMode>
+static __device__ __forceinline__ float FP32FastTanh(float x) {
+#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000
+  if (FastMode) {
+    float y;
+    asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x));
+    return y;
+  }
+#endif
+  return tanhf(x);
+}
+
+template <bool FastMode>
+static __device__ __forceinline__ float FP32GeluFwd(float x) {
+  auto tanh_out =
+      FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
+  return x * 0.5f * (1.0f + tanh_out);
+}
+
+template <bool FastMode>
+static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) {
+  auto tanh_out =
+      FP32FastTanh<FastMode>(0.79788456f * x * (1.0f + 0.044715f * x * x));
+  auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) *
+                         (0.79788456f + 0.1070322243f * x * x)) +
+             0.5f * (1.0f + tanh_out);
+  return tmp * y_g;
+}
+
+template <int VecSize, bool FastMode>
+static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x,
+                                                 __half* y,
+                                                 size_t n) {
+  size_t offset =
+      static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
+  size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
+  for (; offset < n; offset += stride) {
+    using ArrT = phi::AlignedVector<__half, VecSize>;
+    ArrT in_arr = *reinterpret_cast<const ArrT*>(x + offset);
+#pragma unroll
+    for (int i = 0; i < VecSize; ++i) {
+      float tmp = __half2float(in_arr[i]);
+      in_arr[i] = __float2half(FP32GeluFwd<FastMode>(tmp));
+    }
+    *reinterpret_cast<ArrT*>(y + offset) = in_arr;
+  }
+}
+
+template <int VecSize, bool FastMode>
+static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x,
+                                                 const __half* y_g,
+                                                 __half* x_g,
+                                                 size_t n) {
+  size_t offset =
+      static_cast<size_t>(threadIdx.x + blockIdx.x * blockDim.x) * VecSize;
+  size_t stride = static_cast<size_t>(blockDim.x * gridDim.x) * VecSize;
+  for (; offset < n; offset += stride) {
+    using ArrT = phi::AlignedVector<__half, VecSize>;
+    ArrT x_in_arr = *reinterpret_cast<const ArrT*>(x + offset);
+    ArrT y_g_in_arr = *reinterpret_cast<const ArrT*>(y_g + offset);
+#pragma unroll
+    for (int i = 0; i < VecSize; ++i) {
+      __half2 tmp_fp16_2;
+      tmp_fp16_2.x = x_in_arr[i];
+      tmp_fp16_2.y = y_g_in_arr[i];
+      float2 tmp_fp32_2 = __half22float2(tmp_fp16_2);
+      x_in_arr[i] =
+          __float2half(FP32GeluBwd<FastMode>(tmp_fp32_2.x, tmp_fp32_2.y));
+    }
+    *reinterpret_cast<ArrT*>(x_g + offset) = x_in_arr;
+  }
+}
+
+static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
+    const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) {
+  auto is_aligned = [](const void* p, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(p) % alignment == 0;
+  };
+
+#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math)      \
+  do {                                                                        \
+    constexpr auto kAlignment =                                               \
+        alignof(phi::AlignedVector<__half, __vec_size>);                      \
+    if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
+        is_aligned(y, kAlignment)) {                                          \
+      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
+      size_t block = (n / __vec_size + thread - 1) / thread;                  \
+      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
+      VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block          \
+               << " , thread = " << thread;                                   \
+      FP16FastGeluFwdCUDAKernel<                                              \
+          __vec_size,                                                         \
+          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(x, y, n);  \
+      return true;                                                            \
+    }                                                                         \
+  } while (0)
+
+  if (FLAGS_use_fast_math) {
+    PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true);
+  } else {
+    PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false);
+  }
+
+#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL
+  return false;
+}
+
+static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
+    const GPUContext& dev_ctx,
+    const __half* x,
+    const __half* y_g,
+    __half* x_g,
+    size_t n) {
+  auto is_aligned = [](const void* p, size_t alignment) {
+    return reinterpret_cast<uintptr_t>(p) % alignment == 0;
+  };
+
+#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math)      \
+  do {                                                                        \
+    constexpr auto kAlignment =                                               \
+        alignof(phi::AlignedVector<__half, __vec_size>);                      \
+    if (n % __vec_size == 0 && is_aligned(x, kAlignment) &&                   \
+        is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) &&           \
+        is_aligned(x_g, kAlignment)) {                                        \
+      size_t thread = std::min<size_t>(512, dev_ctx.GetMaxThreadsPerBlock()); \
+      size_t block = (n / __vec_size + thread - 1) / thread;                  \
+      block = std::min<size_t>(block, dev_ctx.GetCUDAMaxGridDimSize()[0]);    \
+      VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block          \
+               << " , thread = " << thread;                                   \
+      FP16FastGeluBwdCUDAKernel<                                              \
+          __vec_size,                                                         \
+          __use_fast_math><<<block, thread, 0, dev_ctx.stream()>>>(           \
+          x, y_g, x_g, n);                                                    \
+      return true;                                                            \
+    }                                                                         \
+  } while (0)
+
+  if (FLAGS_use_fast_math) {
+    PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true);
+  } else {
+    PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false);
+  }
+
+#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL
+  return false;
+}
+#endif
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
new file mode 100644
index 0000000000000..1e21f8d4267bc
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gelu_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/gpu/gelu_funcs.h"
+
+DECLARE_bool(use_fast_math);
+
+namespace phi {
+
+template <typename T>
+struct GeluWithApproximateGradFunctor {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType one = static_cast<MPType>(1);
+    MPType half = static_cast<MPType>(0.5);
+    MPType kAlpha = static_cast<MPType>(M_2_SQRTPI * M_SQRT1_2);
+    MPType kBeta =
+        kAlpha * static_cast<MPType>(GELU_CONSTANT) * static_cast<MPType>(3);
+    auto cube_x = x * x * x;
+    auto tanh_out =
+        tanh(kAlpha * ((static_cast<MPType>(GELU_CONSTANT) * cube_x) + x));
+    auto ans =
+        half * (one + tanh_out +
+                (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x));
+    return static_cast<T>(ans * dout);
+  }
+};
+
+template <typename T>
+struct GeluWithoutApproximateGradFunctor {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType dout = static_cast<MPType>(arg_dout);
+    constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast<MPType>(0.5);
+    const MPType cdf = normcdf(x);
+    const MPType pdf = exp(static_cast<MPType>(-0.5) * x * x) * kBeta;
+    return static_cast<T>(dout * (cdf + x * pdf));
+  }
+};
+
+template <typename T, typename Context>
+void GeluGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    bool approximate,
+                    DenseTensor* x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  std::vector<const DenseTensor*> ins = {&x, &out_grad};
+  std::vector<DenseTensor*> outs = {x_grad};
+  if (approximate) {
+#ifdef __NVCC__
+    if (std::is_same<T, dtype::float16>::value) {
+      size_t n = x.numel();
+      const auto* x_ptr = reinterpret_cast<const __half*>(x.data<T>());
+      const auto* y_g_ptr = reinterpret_cast<const __half*>(out_grad.data<T>());
+      auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data<T>());
+      if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(
+              dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) {
+        return;
+      }
+    }
+#endif
+    phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor<T>());
+  } else {
+    phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor<T>());
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gelu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GeluGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu
new file mode 100644
index 0000000000000..ce6dda2d6cc65
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gelu_kernel.cu
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/gelu_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/gpu/gelu_funcs.h"
+
+DECLARE_bool(use_fast_math);
+
+namespace phi {
+
+template <typename T>
+struct GeluWithApproximateFunctor {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
+    // this function is tanh approximation of gelu
+    MPType x = static_cast<MPType>(arg_x);
+    MPType one = static_cast<MPType>(1);
+    MPType half = static_cast<MPType>(0.5);
+    MPType kAlpha = static_cast<MPType>(M_2_SQRTPI * M_SQRT1_2);
+    auto tanh_out =
+        tanh(kAlpha * x * (one + static_cast<MPType>(GELU_CONSTANT) * x * x));
+    MPType out = x * half * (one + tanh_out);
+    return static_cast<T>(out);
+  }
+};
+
+template <typename T>
+struct GeluWithoutApproximateFunctor {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  inline HOSTDEVICE T operator()(T arg_x) {
+    // actual gelu with approximation = false
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(x * normcdf(x));
+  }
+};
+
+template <typename T, typename Context>
+void GeluKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                bool approximate,
+                DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  if (approximate) {
+#ifdef __NVCC__
+    if (std::is_same<T, dtype::float16>::value) {
+      size_t n = x.numel();
+      const auto* in_ptr = reinterpret_cast<const __half*>(x.data<T>());
+      auto* out_ptr = reinterpret_cast<__half*>(out->data<T>());
+      if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(
+              dev_ctx, in_ptr, out_ptr, n)) {
+        return;
+      }
+    }
+#endif
+    phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
+  } else {
+    phi::funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
+        dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor<T>());
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gelu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GeluKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
index 75692966b4662..8bd3337280d75 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
@@ -28,19 +28,19 @@ template <typename Context, typename T, typename IndexT>
 void GraphSendRecvGradOpCUDAKernelLaunchHelper(
     const Context& ctx,
     const DenseTensor& out_grad,
+    const DenseTensor& x,
     const DenseTensor& src_index,
     const DenseTensor& dst_index,
     const std::string& pool_type,
     DenseTensor* x_grad,
     const DenseTensor* dst_count = nullptr,
-    const DenseTensor* x = nullptr,
     const DenseTensor* out = nullptr) {
   const int& index_size = dst_index.dims()[0];
 
   ctx.template Alloc<T>(x_grad);
   T* p_output = x_grad->data<T>();
 
-  const auto& src_dims = out_grad.dims();
+  const auto& src_dims = x.dims();
   int64_t memset_size = 1;
   for (int i = 0; i < src_dims.size(); ++i) {
     memset_size *= src_dims[i];
@@ -86,7 +86,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
     ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
         p_src, d_index, s_index, p_output, index_size, slice_size, s_count);
   } else if (pool_type == "MAX" || pool_type == "MIN") {
-    const T* ptr_input = x->data<T>();
+    const T* ptr_input = x.data<T>();
     const T* ptr_output = out->data<T>();
     ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
         p_src,
@@ -103,7 +103,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
@@ -115,23 +115,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
     GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int64_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   }
 }
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
index fab306f831a6f..2826c071d6ec3 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
@@ -32,6 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                            const DenseTensor& src_index,
                                            const DenseTensor& dst_index,
                                            const std::string& pool_type,
+                                           int64_t out_size,
                                            DenseTensor* out,
                                            DenseTensor* dst_count = nullptr) {
   const int& index_size = src_index.dims()[0];
@@ -39,8 +40,15 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
   T* p_output = out->data<T>();
   const auto& src_dims = x.dims();
   int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) {
-    memset_size *= src_dims[i];
+  if (out_size <= 0) {
+    for (int i = 0; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  } else {
+    memset_size = out_size;
+    for (int i = 1; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
   }
   const size_t& memset_bytes = memset_size * sizeof(T);
   if (pool_type == "SUM" || pool_type == "MEAN") {
@@ -100,6 +108,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                     IndexT>><<<grid, block, 0, ctx.stream()>>>(
         p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
+    if (out_size > 0) {
+      input_size = out_size;
+    }
     int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
     int64_t grid_max =
         grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
@@ -114,6 +125,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                     IndexT>><<<grid, block, 0, ctx.stream()>>>(
         p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
+    if (out_size > 0) {
+      input_size = out_size;
+    }
     int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
     int64_t grid_min =
         grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
@@ -130,6 +144,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
 
     ctx.template Alloc<int32_t>(dst_count);
     int32_t* p_dst_count = dst_count->data<int32_t>();
+    if (out_size > 0) {
+      input_size = out_size;
+    }
 
 #ifdef PADDLE_WITH_HIP
     hipMemset(p_dst_count, 0, input_size * sizeof(int));
@@ -155,15 +172,16 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count) {
   auto index_type = src_index.dtype();
   if (index_type == phi::DataType::INT32) {
     GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
new file mode 100644
index 0000000000000..457a348be832b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu
@@ -0,0 +1,324 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/grid_sample_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace phi {
+
+template <typename T>
+static __forceinline__ __device__ void AtomicAdd(
+    T* data, int h, int w, int sH, int sW, int H, int W, T delta) {
+  if (InBounds(h, w, H, W)) {
+    paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta);
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) {
+  if (align_corners) {
+    *grad_in = static_cast<T>(size - 1) / 2;
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    *grad_in = static_cast<T>(size) / 2;
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T ClipIndexesWithMask(T in,
+                                                        int clip_limit,
+                                                        T* grad_in) {
+  if (in <= static_cast<T>(0)) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  } else {
+    T max = static_cast<T>(clip_limit - 1);
+    if (in >= max) {
+      *grad_in = static_cast<T>(0);
+      return max;
+    } else {
+      *grad_in = static_cast<T>(1);
+      return in;
+    }
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<T>(0);
+    return static_cast<T>(0);
+  }
+  int grad_in_mult_;
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = in - min;
+  if (in < static_cast<T>(0)) {
+    grad_in_mult_ = -1;
+    in = -in;
+  } else {
+    grad_in_mult_ = 1;
+  }
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    *grad_in = static_cast<T>(grad_in_mult_);
+    return extra + min;
+  } else {
+    *grad_in = static_cast<T>(-grad_in_mult_);
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T
+ComputePositionsWithMask(T coord,
+                         int size,
+                         PaddingMode padding_mode,
+                         bool align_corners,
+                         T* grad_in) {
+  T grad_clip, grad_refl;
+  coord = UnnormalizeWithMask<T>(coord, size, align_corners, grad_in);
+  if (padding_mode == PaddingMode::border) {
+    coord = ClipIndexesWithMask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_clip;
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl);
+    } else {
+      coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl);
+    }
+    coord = ClipIndexesWithMask(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_refl * grad_clip;
+  }
+
+  return coord;
+}
+
+template <typename T>
+__global__ void GridSamplerCudaBackwardKernel(const int nthreads,
+                                              const T* grad_output,
+                                              const T* input,
+                                              const T* grid,
+                                              int n,
+                                              int out_c,
+                                              int out_h,
+                                              int out_w,
+                                              int in_h,
+                                              int in_w,
+                                              T* grad_input,
+                                              T* grad_grid,
+                                              const Mode mode,
+                                              const PaddingMode padding_mode,
+                                              bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+
+  int gOut_sN = out_c * out_h * out_w;
+  int gOut_sC = out_h * out_w;
+  int gOut_sH = out_w;
+  int gOut_sW = 1;
+
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    T gix_mult, giy_mult;
+    ix = ComputePositionsWithMask(
+        ix, in_w, padding_mode, align_corners, &gix_mult);
+    iy = ComputePositionsWithMask(
+        iy, in_h, padding_mode, align_corners, &giy_mult);
+
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      T gix = static_cast<T>(0), giy = static_cast<T>(0);
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      int inp_offset_NC = n * inp_sN;
+      for (int c = 0; c < out_c; ++c,
+               inp_offset_NC += inp_sC,
+               gInp_ptr_NC += inp_sC,
+               gOut_offset += gOut_sC) {
+        T gOut = grad_output[gOut_offset];
+
+        AtomicAdd(
+            gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut);
+        AtomicAdd(
+            gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut);
+        AtomicAdd(
+            gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut);
+        AtomicAdd(
+            gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut);
+
+        if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
+          T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW];
+          gix -= nw_val * (iy_se - iy) * gOut;
+          giy -= nw_val * (ix_se - ix) * gOut;
+        }
+        if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
+          T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW];
+          gix += ne_val * (iy_sw - iy) * gOut;
+          giy -= ne_val * (ix - ix_sw) * gOut;
+        }
+        if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
+          T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW];
+          gix -= sw_val * (iy - iy_ne) * gOut;
+          giy += sw_val * (ix_ne - ix) * gOut;
+        }
+        if (InBounds(iy_se, ix_se, in_h, in_w)) {
+          T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW];
+          gix += se_val * (iy - iy_nw) * gOut;
+          giy += se_val * (ix - ix_nw) * gOut;
+        }
+      }
+
+      if (grad_grid != nullptr) {
+        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+        gGrid_ptr_NHW[0] = gix_mult * gix;
+        gGrid_ptr_NHW[1] = giy_mult * giy;
+      }
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(std::nearbyint(ix));
+      int iy_nearest = static_cast<int>(std::nearbyint(iy));
+
+      int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW;
+      T* gInp_ptr_NC = grad_input + n * inp_sN;
+      for (int c = 0; c < out_c;
+           ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) {
+        AtomicAdd(gInp_ptr_NC,
+                  iy_nearest,
+                  ix_nearest,
+                  inp_sH,
+                  inp_sW,
+                  in_h,
+                  in_w,
+                  grad_output[gOut_offset]);
+      }
+
+      if (grad_grid != nullptr) {
+        T* gGrid_ptr_NHW = grad_grid + index * grid_sW;
+        gGrid_ptr_NHW[0] = static_cast<T>(0);
+        gGrid_ptr_NHW[1] = static_cast<T>(0);
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void GridSampleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& grid,
+                          const DenseTensor& out_grad,
+                          const std::string& mode,
+                          const std::string& padding_mode,
+                          bool align_corners,
+                          DenseTensor* x_grad,
+                          DenseTensor* grid_grad) {
+  PaddingMode enum_padding_mode;
+  Mode enum_mode;
+  if (padding_mode == "border") {
+    enum_padding_mode = PaddingMode::border;
+  } else if (padding_mode == "reflection") {
+    enum_padding_mode = PaddingMode::reflect;
+  } else {
+    enum_padding_mode = PaddingMode::zeros;
+  }
+
+  if (mode == "nearest") {
+    enum_mode = Mode::nearest;
+  } else {
+    enum_mode = Mode::bilinear;
+  }
+
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+  const int c = x.dims()[1];
+  const int in_h = x.dims()[2];
+  const int in_w = x.dims()[3];
+
+  dev_ctx.template Alloc<T>(x_grad);
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, x_grad, static_cast<T>(0));
+
+  T* grid_grad_data = nullptr;
+  if (grid_grad != nullptr) {
+    grid_grad_data = dev_ctx.template Alloc<T>(grid_grad);
+  }
+
+  int count = static_cast<int>(n * out_h * out_w);
+  auto cu_stream = dev_ctx.stream();
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
+  GridSamplerCudaBackwardKernel<
+      T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
+      count,
+      out_grad.data<T>(),
+      x.data<T>(),
+      grid.data<T>(),
+      n,
+      c,
+      out_h,
+      out_w,
+      in_h,
+      in_w,
+      x_grad->data<T>(),
+      grid_grad_data,
+      enum_mode,
+      enum_padding_mode,
+      align_corners);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(grid_sample_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GridSampleGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu
new file mode 100644
index 0000000000000..f611b46911c4f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu
@@ -0,0 +1,233 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/grid_sample_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/grid_sample_utils.h"
+
+namespace phi {
+
+template <typename T>
+static __forceinline__ __device__ T Unnormalize(T coord,
+                                                int size,
+                                                bool align_corners) {
+  if (align_corners) {
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T ClipIndexes(T in, int max_value) {
+  return min(static_cast<T>(max_value), max(in, static_cast<T>(0)));
+}
+
+template <typename T>
+static __forceinline__ __device__ T ReflectIndexes(T in,
+                                                   int twice_low,
+                                                   int twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<T>(0);
+  }
+  T min = static_cast<T>(twice_low) / 2;
+  T span = static_cast<T>(twice_high - twice_low) / 2;
+  in = fabs(in - min);
+  T extra = fmod(in, span);
+  int flips = static_cast<int>(floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
+
+template <typename T>
+static __forceinline__ __device__ T ComputePositions(T coord,
+                                                     int size,
+                                                     PaddingMode padding_mode,
+                                                     bool align_corners) {
+  coord = Unnormalize<T>(coord, size, align_corners);
+  if (padding_mode == PaddingMode::border) {
+    coord = ClipIndexes(coord, size - 1);
+  } else if (padding_mode == PaddingMode::reflect) {
+    if (align_corners) {
+      coord = ReflectIndexes(coord, 0, 2 * (size - 1));
+    } else {
+      coord = ReflectIndexes(coord, -1, 2 * size - 1);
+    }
+    coord = ClipIndexes(coord, size - 1);
+  }
+  return coord;
+}
+
+template <typename T>
+__global__ void GridSampleCudaKernel(const int nthreads,
+                                     int n,
+                                     int out_c,
+                                     int out_h,
+                                     int out_w,
+                                     int in_h,
+                                     int in_w,
+                                     const T* input,
+                                     const T* grid,
+                                     T* output,
+                                     const Mode mode,
+                                     const PaddingMode padding_mode,
+                                     bool align_corners) {
+  int inp_sN = out_c * in_h * in_w;
+
+  int inp_sC = in_h * in_w;
+  int inp_sH = in_w;
+  int inp_sW = 1;
+  int grid_sN = out_h * out_w * 2;
+  int grid_sH = out_w * 2;
+  int grid_sW = 2;
+  int grid_sCoor = 1;
+  int out_sN = out_c * out_h * out_w;
+  int out_sC = out_h * out_w;
+  int out_sH = out_w;
+  int out_sW = 1;
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % out_w;
+    const int h = (index / out_w) % out_h;
+    const int n = index / (out_h * out_w);
+    const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
+
+    T ix = grid[grid_offset];
+    T iy = grid[grid_offset + grid_sCoor];
+
+    ix = ComputePositions(ix, in_w, padding_mode, align_corners);
+    iy = ComputePositions(iy, in_h, padding_mode, align_corners);
+    if (mode == Mode::bilinear) {
+      int ix_nw = static_cast<int>(floor(ix));
+      int iy_nw = static_cast<int>(floor(iy));
+      int ix_ne = ix_nw + 1;
+      int iy_ne = iy_nw;
+      int ix_sw = ix_nw;
+      int iy_sw = iy_nw + 1;
+      int ix_se = ix_nw + 1;
+      int iy_se = iy_nw + 1;
+
+      T nw = (ix_se - ix) * (iy_se - iy);
+      T ne = (ix - ix_sw) * (iy_sw - iy);
+      T sw = (ix_ne - ix) * (iy - iy_ne);
+      T se = (ix - ix_nw) * (iy - iy_nw);
+
+      auto inp_offset_NC = n * inp_sN;
+
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        *out_ptr_NCHW = static_cast<T>(0);
+        if (InBounds(iy_nw, ix_nw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+        }
+        if (InBounds(iy_ne, ix_ne, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+        }
+        if (InBounds(iy_sw, ix_sw, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+        }
+        if (InBounds(iy_se, ix_se, in_h, in_w)) {
+          *out_ptr_NCHW +=
+              input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se;
+        }
+      }
+    } else if (mode == Mode::nearest) {
+      int ix_nearest = static_cast<int>(std::nearbyint(ix));
+      int iy_nearest = static_cast<int>(std::nearbyint(iy));
+      auto inp_offset_NC = n * inp_sN;
+      auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
+      for (int c = 0; c < out_c;
+           ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
+        if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) {
+          *out_ptr_NCHW =
+              input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW];
+        } else {
+          *out_ptr_NCHW = static_cast<T>(0);
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void GridSampleKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& grid,
+                      const std::string& mode,
+                      const std::string& padding_mode,
+                      bool align_corners,
+                      DenseTensor* out) {
+  PaddingMode enum_padding_mode;
+  Mode enum_mode;
+  if (padding_mode == "border") {
+    enum_padding_mode = PaddingMode::border;
+  } else if (padding_mode == "reflection") {
+    enum_padding_mode = PaddingMode::reflect;
+  } else {
+    enum_padding_mode = PaddingMode::zeros;
+  }
+
+  if (mode == "nearest") {
+    enum_mode = Mode::nearest;
+  } else {
+    enum_mode = Mode::bilinear;
+  }
+
+  const int n = grid.dims()[0];
+  const int out_h = grid.dims()[1];
+  const int out_w = grid.dims()[2];
+  const int c = x.dims()[1];
+  const int in_h = x.dims()[2];
+  const int in_w = x.dims()[3];
+  VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h
+          << "; out_w: " << out_w;
+
+  auto* output_data = dev_ctx.template Alloc<T>(out);
+  VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; "
+          << out->dims()[2] << "; " << out->dims()[3];
+
+  int count = static_cast<int>(n * out_h * out_w);
+  auto cu_stream = dev_ctx.stream();
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count);
+  GridSampleCudaKernel<
+      T><<<config.block_per_grid, config.thread_per_block, 0, cu_stream>>>(
+      count,
+      n,
+      c,
+      out_h,
+      out_w,
+      in_h,
+      in_w,
+      x.data<T>(),
+      grid.data<T>(),
+      output_data,
+      enum_mode,
+      enum_padding_mode,
+      align_corners);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h
new file mode 100644
index 0000000000000..098eb9defb549
--- /dev/null
+++ b/paddle/phi/kernels/gpu/grid_sample_utils.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace phi {
+
+enum class Mode {
+  bilinear,
+  nearest,
+};
+
+enum class PaddingMode { zeros, border, reflect };
+
+static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 0e042089e1e3d..68573d5596646 100644
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -35,7 +35,7 @@ void LimitGridDim(const Context& ctx, dim3* grid_dim) {
 #define PREDEFINED_BLOCK_SIZE_X 512
 #define PREDEFINED_BLOCK_SIZE 1024
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
-}
+}  // namespace
 
 template <typename T, typename IndexT = int>
 __global__ void IndexSampleForward(const IndexT* index,
diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
new file mode 100644
index 0000000000000..a393eecd51242
--- /dev/null
+++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/index_select_grad_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, typename IndexT>
+__global__ void index_select_grad_cuda_kernel(const T* output_grad,
+                                              T* input_grad,
+                                              const IndexT* index,
+                                              int64_t nums,
+                                              int64_t N,
+                                              int64_t stride,
+                                              int64_t size,
+                                              int64_t delta) {
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+
+  int64_t pre_idx = idx / (stride * size);
+  int64_t dim_idx = idx % (stride * size) / stride;
+  IndexT src_dim_idx = index[dim_idx];
+  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
+  paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
+}
+
+template <typename T>
+__global__ void index_select_grad_init(T* input_grad, int64_t N) {
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  input_grad[idx] = 0.0;
+}
+
+template <typename T, typename Context>
+void IndexSelectGradKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& index,
+                           const DenseTensor& out_grad,
+                           int dim,
+                           DenseTensor* x_grad) {
+  auto* output_grad_data = out_grad.data<T>();
+  auto* in_grad_data = ctx.template Alloc<T>(x_grad);
+
+  auto input_dim = x_grad->dims();
+  auto output_dim = out_grad.dims();
+  dim = dim >= 0 ? dim : dim + input_dim.size();
+  auto stride_dim = phi::stride(input_dim);
+  int64_t stride = stride_dim[dim];
+  int64_t size = output_dim[dim];
+  int64_t delta = input_dim[dim] - size;
+  const auto& index_type = index.dtype();
+
+  bool index_type_match =
+      index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Input(Index) holds the wrong type, it holds %s, but "
+                        "desires to be %s or %s",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  int64_t numel = x_grad->numel();
+  int64_t index_nums = index.numel();
+  int64_t out_nums = out_grad.numel();
+
+  auto stream = ctx.stream();
+
+  index_select_grad_init<
+      T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+           PADDLE_CUDA_NUM_THREADS,
+           0,
+           stream>>>(in_grad_data, numel);
+
+  if (index_type == phi::DataType::INT64) {
+    const int64_t* index_data = index.data<int64_t>();
+    index_select_grad_cuda_kernel<T, int64_t><<<
+        (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+        PADDLE_CUDA_NUM_THREADS,
+        0,
+        stream>>>(output_grad_data,
+                  in_grad_data,
+                  index_data,
+                  index_nums,
+                  out_nums,
+                  stride,
+                  size,
+                  delta);
+    phi::backends::gpu::GpuStreamSync(stream);
+  } else {
+    const int* index_data = index.data<int>();
+    index_select_grad_cuda_kernel<T, int><<<
+        (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+        PADDLE_CUDA_NUM_THREADS,
+        0,
+        stream>>>(output_grad_data,
+                  in_grad_data,
+                  index_data,
+                  index_nums,
+                  out_nums,
+                  stride,
+                  size,
+                  delta);
+    phi::backends::gpu::GpuStreamSync(stream);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(index_select_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IndexSelectGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu
new file mode 100644
index 0000000000000..f774522318acb
--- /dev/null
+++ b/paddle/phi/kernels/gpu/index_select_kernel.cu
@@ -0,0 +1,109 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/index_select_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/data_type.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, typename IndexT>
+__global__ void index_select_cuda_kernel(const T* input,
+                                         T* output,
+                                         const IndexT* index,
+                                         int64_t N,
+                                         int64_t stride,
+                                         int64_t size,
+                                         int64_t delta) {
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+
+  int64_t pre_idx = idx / (stride * size);
+  int64_t dim_idx = idx % (stride * size) / stride;
+  IndexT src_dim_idx = index[dim_idx];
+  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
+  output[idx] = input[input_idx];
+}
+
+template <typename T, typename Context>
+void IndexSelectKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& index,
+                       int dim,
+                       DenseTensor* output) {
+  auto input_dim = x.dims();
+  auto output_dim = output->dims();
+  dim = dim >= 0 ? dim : dim + input_dim.size();
+  auto stride_dim = phi::stride(input_dim);
+  int64_t stride = stride_dim[dim];
+  int64_t size = output_dim[dim];
+  int64_t delta = input_dim[dim] - size;
+  const auto& index_type = index.dtype();
+
+  bool index_type_match =
+      index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32;
+  PADDLE_ENFORCE_EQ(index_type_match,
+                    true,
+                    phi::errors::InvalidArgument(
+                        "Input(Index) holds the wrong type, it holds %s, but "
+                        "desires to be %s or %s",
+                        index_type,
+                        phi::DataType::INT32,
+                        phi::DataType::INT64));
+
+  auto* in_data = x.data<T>();
+  T* out_data = ctx.template Alloc<T>(output);
+
+  int64_t numel = output->numel();
+  auto stream = ctx.stream();
+
+  if (index_type == phi::DataType::INT64) {
+    const int64_t* index_data = index.data<int64_t>();
+    index_select_cuda_kernel<T, int64_t><<<
+        (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+        PADDLE_CUDA_NUM_THREADS,
+        0,
+        stream>>>(in_data, out_data, index_data, numel, stride, size, delta);
+    phi::backends::gpu::GpuStreamSync(stream);
+  } else {
+    const int* index_data = index.data<int>();
+    index_select_cuda_kernel<
+        T,
+        int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+               PADDLE_CUDA_NUM_THREADS,
+               0,
+               stream>>>(
+        in_data, out_data, index_data, numel, stride, size, delta);
+    phi::backends::gpu::GpuStreamSync(stream);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(index_select,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::IndexSelectKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu
new file mode 100644
index 0000000000000..34774ec715c48
--- /dev/null
+++ b/paddle/phi/kernels/gpu/isclose_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/isclose_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/isclose_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
new file mode 100644
index 0000000000000..8ca53f021f054
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) {
+}
diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
new file mode 100644
index 0000000000000..9388ac7071c31
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kldiv_loss_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
new file mode 100644
index 0000000000000..f6e96046a2bd7
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kthvalue_grad_kernel.h"
+
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+static int getBlockSize(int col) {
+  if (col > 512)
+    return 1024;
+  else if (col > 256 && col <= 512)
+    return 512;
+  else if (col > 128 && col <= 256)
+    return 256;
+  else if (col > 64 && col <= 128)
+    return 128;
+  else
+    return 64;
+}
+
+template <typename T, typename Context>
+void KthvalueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& d_out,
+                        const DenseTensor& x,
+                        const DenseTensor& indices,
+                        int k,
+                        int axis,
+                        bool keepdim,
+                        DenseTensor* d_x) {
+  const auto& in_dims = x.dims();
+  auto out_dims = indices.dims();
+  if (axis < 0) axis += in_dims.size();
+  T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
+  const T* out_grad_data = d_out.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+  int pre, n, post;
+  paddle::operators::GetDims(in_dims, axis, &pre, &n, &post);
+  int block_size = getBlockSize(post * k);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+  paddle::operators::AssignGradWithAxis<
+      T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(kthvalue_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::KthvalueGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
new file mode 100644
index 0000000000000..4218e153ec29b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu
@@ -0,0 +1,252 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/kthvalue_kernel.h"
+
+#include "paddle/fluid/operators/top_k_function_cuda.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+inline int getBlockSize(int col) {
+  if (col > 512)
+    return 1024;
+  else if (col > 256 && col <= 512)
+    return 512;
+  else if (col > 128 && col <= 256)
+    return 256;
+  else if (col > 64 && col <= 128)
+    return 128;
+  else
+    return 64;
+}
+
+template <typename T>
+bool SortKthvalue(const phi::GPUContext& dev_ctx,
+                  const DenseTensor* input_tensor,
+                  const int64_t num_cols,
+                  const int64_t num_rows,
+                  const int k,
+                  DenseTensor* out_tensor,
+                  DenseTensor* indices_tensor) {
+  auto cu_stream = dev_ctx.stream();
+  DenseTensor input_indices;
+  const std::vector<int64_t> dims = {num_rows, num_cols};
+  auto dim = phi::make_ddim(dims);
+  input_indices.Resize(dim);
+  dev_ctx.template Alloc<int64_t>(&input_indices);
+  size_t temp_storage_bytes = -1;
+  int block_size = getBlockSize(num_cols);
+  unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0];
+  unsigned int grid_size = num_rows < maxGridDimX
+                               ? static_cast<unsigned int>(num_rows)
+                               : maxGridDimX;
+  paddle::operators::InitIndex<
+      int64_t><<<grid_size, block_size, 0, cu_stream>>>(
+      input_indices.data<int64_t>(), num_rows, num_cols);
+  cub::CountingInputIterator<int64_t> counting_iter(0);
+  cub::TransformInputIterator<int64_t,
+                              paddle::operators::SegmentOffsetIter,
+                              cub::CountingInputIterator<int64_t>>
+      segment_offsets_t(counting_iter,
+                        paddle::operators::SegmentOffsetIter(num_cols));
+  T* sorted_values_ptr;
+  int64_t* sorted_indices_ptr;
+  DenseTensor temp_values, temp_indices;
+  const T* input = input_tensor->data<T>();
+  T* values = out_tensor->data<T>();
+  int64_t* indices = indices_tensor->mutable_data<int64_t>(dev_ctx.GetPlace());
+  temp_values.Resize(dim);
+  temp_indices.Resize(dim);
+  sorted_values_ptr = dev_ctx.template Alloc<T>(&temp_values);
+  sorted_indices_ptr = dev_ctx.template Alloc<int64_t>(&temp_indices);
+  auto err =
+      cub::DeviceSegmentedRadixSort::SortPairs(nullptr,
+                                               temp_storage_bytes,
+                                               input,
+                                               sorted_values_ptr,
+                                               input_indices.data<int64_t>(),
+                                               sorted_indices_ptr,
+                                               num_cols * num_rows,
+                                               num_rows,
+                                               segment_offsets_t,
+                                               segment_offsets_t + 1,
+                                               0,
+                                               sizeof(T) * 8,
+                                               cu_stream);
+#ifdef __HIPCC__
+  if (err != hipSuccess) {
+    LOG(ERROR) << "KthvalueOP failed as could not launch "
+                  "hipcub::DeviceSegmentedRadixSort::SortPairs, status: "
+               << hipGetErrorString(err);
+    return false;
+  }
+#else
+  if (err != cudaSuccess) {
+    LOG(ERROR) << "KthvalueOP failed as could not launch "
+                  "cub::DeviceSegmentedRadixSort::SortPairs, status: "
+               << cudaGetErrorString(err);
+    return false;
+  }
+#endif
+  DenseTensor temp_storage;
+  temp_storage.Resize({static_cast<int>(temp_storage_bytes / sizeof(uint8_t))});
+  uint8_t* temp_storage_data = dev_ctx.template Alloc<uint8_t>(&temp_storage);
+
+  err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data,
+                                                 temp_storage_bytes,
+                                                 input,
+                                                 sorted_values_ptr,
+                                                 input_indices.data<int64_t>(),
+                                                 sorted_indices_ptr,
+                                                 num_cols * num_rows,
+                                                 num_rows,
+                                                 segment_offsets_t,
+                                                 segment_offsets_t + 1,
+                                                 0,
+                                                 sizeof(T) * 8,
+                                                 cu_stream);
+#ifdef __HIPCC__
+  if (err != hipSuccess) {
+    LOG(ERROR) << "KthvalueOP failed as could not launch "
+                  "hipcub::DeviceSegmentedRadixSort::SortPairs, "
+               << temp_storage_bytes << ", status: " << hipGetErrorString(err);
+    return false;
+  }
+#else
+  if (err != cudaSuccess) {
+    LOG(ERROR) << "KthvalueOP failed as could not launch "
+                  "cub::DeviceSegmentedRadixSort::SortPairs, "
+               << temp_storage_bytes << ", status: " << cudaGetErrorString(err);
+    return false;
+  }
+#endif
+  auto& dev = *dev_ctx.eigen_device();
+  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, k - 1};
+  const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, 1};
+  auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
+  auto e_tmp_indices =
+      EigenMatrix<int64_t>::From(static_cast<const DenseTensor>(temp_indices));
+  std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(1)};
+  dim = phi::make_ddim(odims);
+  auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
+  auto e_tmp_values =
+      EigenMatrix<T>::From(static_cast<const DenseTensor>(temp_values));
+
+  funcs::EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
+      dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);
+  funcs::EigenSlice<std::decay_t<decltype(dev)>, T, 2>::Eval(
+      dev, e_values, e_tmp_values, slice_indices, slice_sizes);
+  return true;
+}
+
+template <typename T, typename Context>
+void KthvalueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    int k,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* output,
+                    DenseTensor* indices) {
+  const auto& in_dims = x.dims();
+  if (axis < 0) axis += in_dims.size();
+  auto out_dims = output->dims();
+  const T* input_data = x.data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(output);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    PADDLE_ENFORCE_EQ(
+        SortKthvalue<T>(
+            dev_ctx, &x, input_width, input_height, k, output, indices),
+        true,
+        phi::errors::External("KthvalueOP: Error when use cub sorting"));
+    return;
+  } else {
+    std::vector<int> trans;
+    for (int i = 0; i < axis; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans.emplace_back(i);
+    }
+    trans.emplace_back(axis);
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dims = phi::make_ddim(tmp_out_shape);
+      output->Resize(tmp_out_dims);
+      indices->Resize(tmp_out_dims);
+    }
+    DDim trans_dims(in_dims);
+    DDim trans_out_dims(in_dims);
+    for (int i = 0; i < trans.size(); i++) {
+      trans_dims[i] = in_dims[trans[i]];
+      trans_out_dims[i] = in_dims[trans[i]];
+    }
+    trans_out_dims[in_dims.size() - 1] = 1;
+    DenseTensor trans_input;
+    trans_input.mutable_data<T>(trans_dims, dev_ctx.GetPlace());
+    int ndims = trans.size();
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, x, &trans_input, trans);
+    DenseTensor trans_ind, trans_out;
+    trans_ind.mutable_data<int64_t>(trans_out_dims, dev_ctx.GetPlace());
+    trans_out.mutable_data<T>(trans_out_dims, dev_ctx.GetPlace());
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
+    const int64_t input_width = trans_dims[trans_dims.size() - 1];
+    PADDLE_ENFORCE_EQ(
+        SortKthvalue<T>(dev_ctx,
+                        &trans_input,
+                        input_width,
+                        input_height,
+                        k,
+                        &trans_out,
+                        &trans_ind),
+        true,
+        phi::errors::External("KthvalueOP: Error when use cub sorting"));
+    funcs::TransCompute<phi::GPUContext, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans);
+    funcs::TransCompute<phi::GPUContext, T>(
+        ndims, dev_ctx, trans_out, output, trans);
+    if (!keepdim) {
+      output->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(kthvalue,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::KthvalueKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
new file mode 100644
index 0000000000000..c3f7a5261712a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/layer_norm_grad_kernel.h"
+
+#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/layer_norm_util.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LayerNormGradKernel(const Context &dev_ctx,
+                         const DenseTensor &x,
+                         const DenseTensor &mean,
+                         const DenseTensor &variance,
+                         paddle::optional<const DenseTensor &> scale_opt,
+                         paddle::optional<const DenseTensor &> bias_opt,
+                         const DenseTensor &out_grad,
+                         float epsilon,
+                         int begin_norm_axis,
+                         bool is_test,
+                         DenseTensor *x_grad,
+                         DenseTensor *scale_grad,
+                         DenseTensor *bias_grad) {
+  using U = paddle::operators::LayerNormParamType<T>;
+  // d_x, d_scale, d_bias may be nullptr
+  auto *d_x = x_grad;
+  auto *d_scale = scale_grad;
+  auto *d_bias = bias_grad;
+
+  auto *scale = scale_opt.get_ptr();
+  auto *bias = bias_opt.get_ptr();
+  auto *d_y = &out_grad;
+
+  const auto &x_dims = x.dims();
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
+  int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
+
+  auto *x_data = x.data<T>();
+  auto *d_y_data = d_y->data<T>();
+
+  auto *mean_data = mean.data<U>();
+  auto *var_data = variance.data<U>();
+
+  auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc<T>(d_x));
+
+  auto x_dtype = x.dtype();
+
+  phi::DataType scale_bias_dtype;
+  if (scale != nullptr) {
+    scale_bias_dtype = scale->dtype();
+  } else {
+    // FIXME(zengjinle): do not find a better way to get the right
+    // data type of the d_scale and d_bias if scale == nullptr.
+    if (bias != nullptr) {
+      scale_bias_dtype = bias->dtype();
+    } else {
+      scale_bias_dtype = x_dtype;
+    }
+  }
+
+#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX)  \
+  do {                                                                      \
+    auto *scale_data =                                                      \
+        (scale == nullptr ? nullptr : scale->data<ScaleBiasT>());           \
+    auto *d_scale_data =                                                    \
+        (d_scale == nullptr ? nullptr                                       \
+                            : dev_ctx.template Alloc<ScaleBiasT>(d_scale)); \
+    auto *d_bias_data =                                                     \
+        (d_bias == nullptr ? nullptr                                        \
+                           : dev_ctx.template Alloc<ScaleBiasT>(d_bias));   \
+    auto *d_x_data =                                                        \
+        (d_x == nullptr ? nullptr : dev_ctx.template Alloc<T>(d_x));        \
+    paddle::operators::LayerNormBackward<T, U, IsScaleBiasSameDTypeWithX>(  \
+        x_data,                                                             \
+        d_y_data,                                                           \
+        scale_data,                                                         \
+        mean_data,                                                          \
+        var_data,                                                           \
+        d_x_data,                                                           \
+        d_scale_data,                                                       \
+        d_bias_data,                                                        \
+        epsilon,                                                            \
+        batch_size,                                                         \
+        feature_size,                                                       \
+        dev_ctx);                                                           \
+  } while (0)
+
+  if (scale_bias_dtype == x_dtype) {
+    PADDLE_LAUNCH_LAYERNORM_BWD(T, true);
+  } else {
+    PADDLE_LAUNCH_LAYERNORM_BWD(U, false);
+  }
+
+#undef PADDLE_LAUNCH_LAYERNORM_BWD
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(layer_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormGradKernel,
+                   float,
+                   phi::dtype::float16) {}
+#elif CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(layer_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(layer_norm_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
new file mode 100644
index 0000000000000..d87b7c2193811
--- /dev/null
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -0,0 +1,229 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/layer_norm_kernel.h"
+
+#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/layer_norm_util.h"
+
+namespace phi {
+
+template <typename T>
+void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
+                                               const T *input,
+                                               std::vector<int> input_shape,
+                                               const T *bias,
+                                               const T *scale,
+                                               T *output,
+                                               T *mean,
+                                               T *variance,
+                                               int begin_norm_axis,
+                                               float eps) {
+  const auto x_dims = phi::make_ddim(input_shape);
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
+  int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
+  switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
+    FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<
+                         T,
+                         T,
+                         kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
+        input, scale, bias, output, mean, variance, eps, feature_size));
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Product from begin_norm_axis to end in layer_norm must be larger "
+          "than 1"));
+      break;
+  }
+}
+
+template class LayerNormDirectCUDAFunctor<float>;
+
+template <typename T, typename Context>
+void LayerNormKernel(const Context &dev_ctx,
+                     const DenseTensor &x,
+                     paddle::optional<const DenseTensor &> scale_opt,
+                     paddle::optional<const DenseTensor &> bias_opt,
+                     float epsilon,
+                     int begin_norm_axis,
+                     bool is_test,
+                     DenseTensor *y,
+                     DenseTensor *mean,
+                     DenseTensor *var) {
+  using U = paddle::operators::LayerNormParamType<T>;
+  auto *scale = scale_opt.get_ptr();
+  auto *bias = bias_opt.get_ptr();
+
+  const auto x_dims = x.dims();
+  auto *x_data = x.data<T>();
+  auto *y_data = dev_ctx.template Alloc<T>(y);
+  auto *mean_data = dev_ctx.template Alloc<U>(mean);
+  auto *var_data = dev_ctx.template Alloc<U>(var);
+
+  auto *void_scale_data = (scale == nullptr ? nullptr : scale->data());
+  auto *void_bias_data = (bias == nullptr ? nullptr : bias->data());
+
+  auto x_dtype = x.dtype();
+  phi::DataType scale_bias_dtype;
+  if (void_scale_data != nullptr) {
+    scale_bias_dtype = scale->dtype();
+    if (void_bias_data != nullptr) {
+      PADDLE_ENFORCE_EQ(
+          scale->dtype(),
+          bias->dtype(),
+          phi::errors::InvalidArgument("Thie Scale and Bias of layer_norm op "
+                                       "should have the same data type."));
+    }
+  } else {
+    scale_bias_dtype = (void_bias_data != nullptr ? bias->dtype() : x_dtype);
+  }
+
+  bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype;
+  if (!is_scale_bias_same_dtype_with_x) {
+    PADDLE_ENFORCE_EQ(scale_bias_dtype,
+                      paddle::experimental::CppTypeToDataType<U>::Type(),
+                      phi::errors::InvalidArgument(
+                          "Unsupported data type of Scale and Bias"));
+  }
+
+  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
+  int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
+
+  auto stream = dev_ctx.stream();
+
+#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \
+  do {                                                                     \
+    switch (paddle::operators::GetDesiredBlockDim(feature_size)) {         \
+      FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward<            \
+                           T,                                              \
+                           U,                                              \
+                           kBlockDim,                                      \
+                           IsScaleBiasSameDTypeWithX><<<batch_size,        \
+                                                        kBlockDim,         \
+                                                        0,                 \
+                                                        stream>>>(         \
+          x_data,                                                          \
+          static_cast<const ScaleBiasT *>(void_scale_data),                \
+          static_cast<const ScaleBiasT *>(void_bias_data),                 \
+          y_data,                                                          \
+          mean_data,                                                       \
+          var_data,                                                        \
+          epsilon,                                                         \
+          feature_size));                                                  \
+      default:                                                             \
+        PADDLE_THROW(phi::errors::InvalidArgument(                         \
+            "Product from begin_norm_axis to end must be larger than 1")); \
+        break;                                                             \
+    }                                                                      \
+  } while (0)
+
+#ifdef PADDLE_WITH_CUDA
+  bool can_call_1024_kernel = false;
+  if (feature_size == 1024 && scale != nullptr && bias != nullptr) {
+    can_call_1024_kernel = true;
+  }
+  if (can_call_1024_kernel) {
+    const int WARPS_M = 4;
+    const int WARPS_N = 1;
+    const int THREADS_PER_WARP = 32;
+    const int BYTES_PER_LDG = 16;
+    const int VecSize = BYTES_PER_LDG / sizeof(T);
+
+    const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M;
+    const int ROWS_PER_CTA = WARPS_M;
+
+    const int grid = static_cast<int>(
+        std::ceil(batch_size / static_cast<float>(ROWS_PER_CTA)));
+    if (is_scale_bias_same_dtype_with_x) {
+      paddle::operators::ln_fwd_1024_kernel<
+          T,
+          U,
+          T,
+          VecSize,
+          WARPS_M,
+          WARPS_N,
+          BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
+          batch_size,
+          feature_size,
+          epsilon,
+          x_data,
+          static_cast<const T *>(void_scale_data),
+          static_cast<const T *>(void_bias_data),
+          mean_data,
+          var_data,
+          y_data);
+    } else {
+      paddle::operators::ln_fwd_1024_kernel<
+          T,
+          U,
+          U,
+          VecSize,
+          WARPS_M,
+          WARPS_N,
+          BYTES_PER_LDG><<<grid, THREADS_PER_CTA, 0, stream>>>(
+          batch_size,
+          feature_size,
+          epsilon,
+          x_data,
+          static_cast<const U *>(void_scale_data),
+          static_cast<const U *>(void_bias_data),
+          mean_data,
+          var_data,
+          y_data);
+    }
+  } else {
+#endif
+    if (is_scale_bias_same_dtype_with_x) {
+      PADDLE_LAUNCH_LAYERNORM_FWD(T, true);
+    } else {
+      PADDLE_LAUNCH_LAYERNORM_FWD(U, false);
+    }
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+
+#undef PADDLE_LAUNCH_LAYERNORM_FWD
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   phi::dtype::float16) {}
+#elif CUDNN_VERSION_MIN(8, 1, 0)
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(layer_norm,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LayerNormKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
new file mode 100644
index 0000000000000..3e4cd21a658f1
--- /dev/null
+++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h"
+PD_REGISTER_KERNEL(
+    lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/lgamma_kernel.cu b/paddle/phi/kernels/gpu/lgamma_kernel.cu
new file mode 100644
index 0000000000000..e94d67f4ce324
--- /dev/null
+++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/lgamma_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_base.h"
+
+namespace phi {
+template <typename T>
+struct CudaLgammaFunctor {
+  __device__ __forceinline__ T operator()(const T x) const {
+    return Eigen::numext::lgamma(x);
+  }
+};
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out) {
+  // XKTODO( add gpu kernel implementation. )
+  dev_ctx.template Alloc<T>(out);
+  std::vector<const DenseTensor*> ins = {&x};
+  std::vector<DenseTensor*> outs = {out};
+  auto functor = CudaLgammaFunctor<T>();
+  phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(lgamma, GPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
new file mode 100644
index 0000000000000..f7b282536558d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_softmax_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogSoftmaxGradKernel(const Context &dev_ctx,
+                          const DenseTensor &out,
+                          const DenseTensor &out_grad,
+                          int axis,
+                          DenseTensor *x_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  phi::SoftmaxBackwardCUDAKernelDriver<T, true>(
+      dev_ctx, out, out_grad, axis, x_grad);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(log_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogSoftmaxGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(log_softmax_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogSoftmaxGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#endif
diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu
new file mode 100644
index 0000000000000..d7e34c6c14e7a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/log_softmax_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogSoftmaxKernel(const Context &dev_ctx,
+                      const DenseTensor &x,
+                      int axis,
+                      DenseTensor *out) {
+  dev_ctx.template Alloc<T>(out);
+  phi::SoftmaxForwardCUDAKernelDriver<T, true>(dev_ctx, x, axis, out);
+}
+
+}  // namespace phi
+
+#ifdef PADDLE_WITH_HIP
+PD_REGISTER_KERNEL(log_softmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogSoftmaxKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#else
+PD_REGISTER_KERNEL(log_softmax,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogSoftmaxKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+#endif
diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
index 71b7cd8750462..5a4ce3a2679b9 100644
--- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
@@ -17,38 +17,30 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/masked_select_grad_kernel.h"
 
 namespace phi {
 
-__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx])
-      mask_array[idx] = 1;
-    else
-      mask_array[idx] = 0;
-  }
-}
+template <typename MT, typename InT, typename OutT>
+struct MaskedSelectGradFunctor {
+  HOSTDEVICE MaskedSelectGradFunctor() {}
 
-template <typename T>
-__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
-                                         const bool* mask,
-                                         const T* input,
-                                         T* out,
-                                         int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx]) {
-      int index = mask_prefix_sum[idx];
-      out[idx] = input[index];
-    } else {
-      out[idx] = 0;
+  HOSTDEVICE inline void operator()(OutT* out,
+                                    const MT* mask,
+                                    const InT* value,
+                                    int num) {
+    int read_fix = 0;
+    for (int idx = 0; idx < num; idx++) {
+      if (mask[idx]) {
+        out[idx] = value[read_fix++];
+      } else {
+        out[idx] = 0;
+      }
     }
   }
-}
+};
 
 template <typename T, typename Context>
 void MaskedSelectGradKernel(const Context& dev_ctx,
@@ -56,42 +48,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             const DenseTensor& x,
                             const DenseTensor& mask,
                             DenseTensor* x_grad) {
-  auto* mask_data = mask.data<bool>();
-  auto* input_data = out_grad.data<T>();
-  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
-
-  auto input_size = out_grad.numel();
   auto mask_size = mask.numel();
-  auto mask_dim = mask.dims();
-
-  auto out_size = mask_size;
-
-  DenseTensor mask_array;
-  DenseTensor mask_prefix_sum;
-  mask_array.Resize(mask_dim);
-  mask_prefix_sum.Resize(mask_dim);
-
-  int32_t* mask_array_data =
-      mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int32_t* mask_prefix_sum_data =
-      mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int threads = 512;
-  int grid = (mask_size + threads - 1) / threads;
-  auto stream = dev_ctx.stream();
-  SetMaskArrayT<<<grid, threads, 0, stream>>>(
-      mask_data, mask_array_data, mask_size);
-
-  thrust::device_ptr<int32_t> mask_array_dev_ptr =
-      thrust::device_pointer_cast(mask_array_data);
-  thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
-                                                mask_array_dev_ptr + mask_size);
-  thrust::exclusive_scan(thrust::device,
-                         mask_array_vec.begin(),
-                         mask_array_vec.end(),
-                         mask_prefix_sum_data);
-
-  SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
-      mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  dev_ctx.template Alloc<T>(x_grad);
+  if (mask_size <= 0) return;
+  using Functor = MaskedSelectGradFunctor<bool, T, T>;
+  phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
+      dev_ctx, mask, out_grad, x_grad, Functor());
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu
index fc4adca2f4243..8986c97583e20 100644
--- a/paddle/phi/kernels/gpu/masked_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu
@@ -17,36 +17,30 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
+#include "paddle/phi/kernels/masked_select_kernel.h"
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/masked_select_kernel.h"
 
 namespace phi {
 
-__global__ void SetMaskArray(const bool* mask, int32_t* mask_array, int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx])
-      mask_array[idx] = 1;
-    else
-      mask_array[idx] = 0;
-  }
-}
+template <typename MT, typename InT, typename OutT>
+struct MaskedSelectFunctor {
+  HOSTDEVICE MaskedSelectFunctor() {}
 
-template <typename T>
-__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum,
-                                     const bool* mask,
-                                     const T* input,
-                                     T* out,
-                                     int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx]) {
-      int index = mask_prefix_sum[idx];
-      out[index] = input[idx];
+  HOSTDEVICE inline void operator()(OutT* out,
+                                    const MT* mask,
+                                    const InT* value,
+                                    int num) {
+    int store_fix = 0;
+    for (int idx = 0; idx < num; idx++) {
+      if (mask[idx]) {
+        out[store_fix++] = value[idx];
+      }
     }
   }
-}
+};
 
 template <typename T, typename Context>
 void MaskedSelectKernel(const Context& dev_ctx,
@@ -68,42 +62,9 @@ void MaskedSelectKernel(const Context& dev_ctx,
                         "value.",
                         input_dim,
                         mask_dim));
-
-  thrust::device_ptr<const bool> mask_dev_ptr =
-      thrust::device_pointer_cast(mask_data);
-  thrust::device_vector<T> mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size);
-  auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true);
-
-  DDim out_dim{out_size};
-  out->Resize(out_dim);
-  auto out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-
-  DenseTensor mask_array;
-  DenseTensor mask_prefix_sum;
-  mask_array.Resize(mask_dim);
-  mask_prefix_sum.Resize(mask_dim);
-
-  int32_t* mask_array_data =
-      mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int32_t* mask_prefix_sum_data =
-      mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int threads = 512;
-  int grid = (mask_size + threads - 1) / threads;
-  auto stream = dev_ctx.stream();
-  SetMaskArray<<<grid, threads, 0, stream>>>(
-      mask_data, mask_array_data, mask_size);
-
-  thrust::device_ptr<int32_t> mask_array_dev_ptr =
-      thrust::device_pointer_cast(mask_array_data);
-  thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
-                                                mask_array_dev_ptr + mask_size);
-  thrust::exclusive_scan(thrust::device,
-                         mask_array_vec.begin(),
-                         mask_array_vec.end(),
-                         mask_prefix_sum_data);
-
-  SelectWithPrefixMask<T><<<grid, threads, 0, stream>>>(
-      mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  using Functor = MaskedSelectFunctor<bool, T, T>;
+  phi::funcs::SelectKernel<bool, T, T, 1, Functor>(
+      dev_ctx, mask, x, out, Functor());
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu
deleted file mode 100644
index af9d5574aa9fe..0000000000000
--- a/paddle/phi/kernels/gpu/math_kernel.cu
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/phi/kernels/math_kernel.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/kernels/funcs/broadcast_function.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-#include "paddle/phi/common/complex.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/core/compat/convert_utils.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/kernel_registry.h"
-
-namespace phi {
-
-#define DEFINE_CUDA_ELEMENTWISE_OP(name)                             \
-  template <typename T, typename Context>                            \
-  void name##RawKernel(const Context& dev_ctx,                       \
-                       const DenseTensor& x,                         \
-                       const DenseTensor& y,                         \
-                       int axis,                                     \
-                       DenseTensor* out) {                           \
-    std::vector<const DenseTensor*> inputs;                          \
-    std::vector<DenseTensor*> outputs;                               \
-    inputs.emplace_back(&x);                                         \
-    inputs.emplace_back(&y);                                         \
-    outputs.emplace_back(out);                                       \
-    dev_ctx.template Alloc<T>(out);                                  \
-    funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(          \
-        dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
-  }
-
-/**
- * Kernels
- */
-
-template <typename T, typename Context>
-void MeanRawKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const std::vector<int64_t>& dims,
-                   bool keep_dim,
-                   bool reduce_all,
-                   DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-template <typename T, typename Context>
-void SumRawKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const std::vector<int64_t>& dims,
-                  bool keep_dim,
-                  bool reduce_all,
-                  DataType out_dtype,
-                  DenseTensor* out) {
-  phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-// Create the definition of Add
-DEFINE_CUDA_ELEMENTWISE_OP(Add)
-// Create the definition of Subtract
-DEFINE_CUDA_ELEMENTWISE_OP(Subtract)
-// Create the definition of Multiply
-DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
-// Create the definition of Divide
-DEFINE_CUDA_ELEMENTWISE_OP(Divide)
-
-}  // namespace phi
-
-using float16 = phi::dtype::float16;
-using bfloat16 = phi::dtype::bfloat16;
-using complex64 = ::phi::dtype::complex<float>;
-using complex128 = ::phi::dtype::complex<double>;
-
-PD_REGISTER_KERNEL(add_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AddRawKernel,
-                   float,
-                   double,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float16,
-                   bfloat16,
-                   complex64,
-                   complex128) {}
-PD_REGISTER_KERNEL(subtract_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SubtractRawKernel,
-                   float,
-                   double,
-                   int16_t,
-                   int,
-                   int64_t,
-                   float16,
-                   bfloat16,
-                   complex64,
-                   complex128) {}
-PD_REGISTER_KERNEL(divide_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::DivideRawKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   float16,
-                   bfloat16,
-                   complex64,
-                   complex128) {}
-PD_REGISTER_KERNEL(multiply_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::MultiplyRawKernel,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   bool,
-                   float16,
-                   complex64,
-                   complex128,
-                   bfloat16) {}
-PD_REGISTER_KERNEL(sum_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::SumRawKernel,
-                   bool,
-                   float,
-                   double,
-                   float16,
-                   bfloat16,
-                   int16_t,
-                   int,
-                   int64_t,
-                   complex64,
-                   complex128) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-
-PD_REGISTER_KERNEL(mean_raw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::MeanRawKernel,
-                   float,
-                   double,
-                   bool,
-                   float16,
-                   int,
-                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
index 2009547fc8d6f..66ba30f7ce694 100644
--- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
+++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu
@@ -23,12 +23,12 @@
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/abs_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
-#include "paddle/phi/kernels/math_kernel.h"
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
new file mode 100644
index 0000000000000..43502621c2d3a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void AssignGradWithAxis(const T* grad_out,
+                                   const int64_t* indices,
+                                   T* grad_in,
+                                   int pre,
+                                   int post,
+                                   int raw_height,
+                                   int k) {
+  // raw_height is the length of topk axis
+  for (int i = blockIdx.x; i < pre; i += gridDim.x) {
+    int base_index = i * post * k;
+    int base_grad = i * post * raw_height;
+    for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) {
+      grad_in[base_grad + j] = static_cast<T>(0);
+    }
+    __syncthreads();
+    for (int j = threadIdx.x; j < k * post; j += blockDim.x) {
+      int64_t idx_ij = indices[base_index + j];
+      int64_t in_ij = base_grad + (idx_ij * post) + (j % post);
+      grad_in[in_ij] = grad_out[base_index + j];
+    }
+  }
+}
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad) {
+  const auto& in_dims = x.dims();
+  auto out_dims = indices.dims();
+
+  if (axis < 0) axis += in_dims.size();
+  // allocate the cuda memory for the x_grad
+  T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
+  const T* out_grad_data = out_grad.data<T>();
+  const int64_t* indices_data = indices.data<int64_t>();
+
+  int pre, n, post;
+  funcs::GetDims(in_dims, axis, &pre, &n, &post);
+
+  // calcluate the block and grid num
+  int block_size = funcs::ComputeBlockSize(post);
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1);
+  int grid_size = std::min(max_blocks, pre);
+  AssignGradWithAxis<T><<<grid_size, block_size, 64 * 4, dev_ctx.stream()>>>(
+      out_grad_data, indices_data, x_grad_data, pre, post, n, 1);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(mode_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ModeGradKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu
new file mode 100644
index 0000000000000..629b9722cd6bc
--- /dev/null
+++ b/paddle/phi/kernels/gpu/mode_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/mode_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/mode.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices) {
+  // get the input dims
+  const auto& in_dims = x.dims();
+  // calcluate the real axis
+  if (axis < 0) axis += in_dims.size();
+
+  auto out_dims = out->dims();
+
+  const T* input_data = x.data<T>();
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+
+  if (axis == in_dims.size() - 1) {
+    const int64_t& input_height =
+        phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
+    const int64_t& input_width = in_dims[in_dims.size() - 1];
+    funcs::GetModebySort<T>(
+        dev_ctx, &x, input_width, input_height, output_data, indices_data);
+  } else {
+    std::vector<int> trans_axis;
+    for (int i = 0; i < axis; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(in_dims.size() - 1);
+    for (int i = axis + 1; i < in_dims.size() - 1; i++) {
+      trans_axis.emplace_back(i);
+    }
+    trans_axis.emplace_back(axis);
+
+    if (!keepdim) {
+      std::vector<int> tmp_out_shape;
+      for (int i = 0; i < axis; i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      tmp_out_shape.emplace_back(1);
+      for (int i = axis + 1; i < in_dims.size(); i++) {
+        tmp_out_shape.emplace_back(in_dims[i]);
+      }
+      DDim tmp_out_dim = phi::make_ddim(tmp_out_shape);
+      out->Resize(tmp_out_dim);
+      indices->Resize(tmp_out_dim);
+    }
+
+    DDim trans_shape(in_dims);
+    DDim trans_out_shape(in_dims);
+    for (int i = 0; i < trans_axis.size(); i++) {
+      trans_shape[i] = in_dims[trans_axis[i]];
+      trans_out_shape[i] = in_dims[trans_axis[i]];
+    }
+    trans_out_shape[in_dims.size() - 1] = 1;
+
+    // second step, tranpose the input
+    DenseTensor trans_input;
+    trans_input.Resize(trans_shape);
+    dev_ctx.template Alloc<T>(&trans_input);
+
+    int ndims = trans_axis.size();
+    funcs::TransCompute<Context, T>(
+        ndims, dev_ctx, x, &trans_input, trans_axis);
+    DenseTensor trans_ind;
+    trans_ind.Resize(trans_out_shape);
+    int64_t* trans_ind_data = dev_ctx.template Alloc<int64_t>(&trans_ind);
+
+    DenseTensor trans_out;
+    trans_out.Resize(trans_out_shape);
+    T* trans_out_data = dev_ctx.template Alloc<T>(&trans_out);
+
+    const int64_t input_height =
+        phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1));
+    const int64_t input_width = trans_shape[trans_shape.size() - 1];
+    funcs::GetModebySort<T>(dev_ctx,
+                            &trans_input,
+                            input_width,
+                            input_height,
+                            trans_out_data,
+                            trans_ind_data);
+    // last step, tranpose back the indices and output
+    funcs::TransCompute<Context, int64_t>(
+        ndims, dev_ctx, trans_ind, indices, trans_axis);
+    funcs::TransCompute<Context, T>(ndims, dev_ctx, trans_out, out, trans_axis);
+    if (!keepdim) {
+      out->Resize(out_dims);
+      indices->Resize(out_dims);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu
index 4918495ff7bed..752a91fa48198 100644
--- a/paddle/phi/kernels/gpu/multinomial_kernel.cu
+++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu
@@ -23,11 +23,32 @@ limitations under the License. */
 #include <thrust/scan.h>
 #include <thrust/transform.h>
 
-#include "paddle/fluid/platform/transform.h"
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/arg_min_max_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/inclusive_scan.h"
 #include "paddle/phi/kernels/funcs/multinomial_functor.h"
+#include "paddle/phi/kernels/top_k_kernel.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/transform.h"
+
+DECLARE_bool(use_curand);
 
 namespace phi {
 
@@ -57,12 +78,12 @@ template <typename T>
 __global__ void GetCumulativeProbs(T* norm_probs_data,
                                    int64_t num_distributions,
                                    int64_t num_categories,
-                                   T* cumulative_probs) {
+                                   T* cumulative_probs_data) {
   int id = blockIdx.x;
   thrust::inclusive_scan(thrust::device,
                          norm_probs_data + id * num_categories,
                          norm_probs_data + (id + 1) * num_categories,
-                         cumulative_probs + id * num_categories);
+                         cumulative_probs_data + id * num_categories);
 }
 
 template <typename T>
@@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor {
 };
 
 template <typename T>
-__device__ int binarySearchFunctor(T* cumulative_probs,
+__device__ int binarySearchFunctor(T* cumulative_probs_data,
                                    T* norm_probs_data,
                                    int num_categories,
                                    T rng_number) {
@@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs,
   while (right - left > 0) {
     int mid = left + (right - left) / 2;
 
-    T temp_prob = cumulative_probs[mid];
+    T temp_prob = cumulative_probs_data[mid];
     if (temp_prob < rng_number) {
       left = mid + 1;
     } else {
@@ -114,26 +135,35 @@ __global__ void sampleMultinomialWithReplacement(
     int64_t* out_data,
     const int64_t num_distributions,
     const int64_t num_categories,
-    T* cumulative_probs,
-    T* norm_probs_data) {
+    T* cumulative_probs_data,
+    T* norm_probs_data,
+    uint64_t seed,
+    uint64_t offset,
+    bool use_curand) {
   // use binary search to get the selected category sample id.
-  // let cumulative_probs[id-1] < rng_data < cumulative_probs[id].
+  // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id].
+  size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x +
+               threadIdx.x;
 
-  // for every distribution
-  int dist = blockIdx.y;
-  // for every sample
-  int sample = blockIdx.x * blockDim.x + threadIdx.x;
-  if (sample < num_samples) {
-    T rng_number = rng_data[sample + dist * num_samples];
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx, offset, &state);
 
-    // Find the bucket that a uniform random number lies in
-    int selected_category =
-        binarySearchFunctor<T>(cumulative_probs + dist * num_categories,
-                               norm_probs_data + dist * num_categories,
-                               num_categories,
-                               rng_number);
+  int sample = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) {
+    if (sample < num_samples) {
+      T rng_number = rng_data[sample + dist * num_samples];
+      if (use_curand) {
+        rng_number = static_cast<T>(curand_uniform4(&state).x);
+      }
+      // Find the bucket that a uniform random number lies in
+      int selected_category =
+          binarySearchFunctor<T>(cumulative_probs_data + dist * num_categories,
+                                 norm_probs_data + dist * num_categories,
+                                 num_categories,
+                                 rng_number);
 
-    out_data[sample + dist * num_samples] = selected_category;
+      out_data[sample + dist * num_samples] = selected_category;
+    }
   }
 }
 
@@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx,
                in_data_numel * sizeof(T),
                cudaMemcpyDeviceToHost);
 #endif
+    if (FLAGS_use_curand) {
+      for (size_t i = 0; i < num_distributions; ++i) {
+        int zero_num = 0;
+        for (size_t j = 0; j < num_categories; ++j) {
+          T weight = cpu_in_data[i * num_distributions + j];
+          PADDLE_ENFORCE_GE(
+              weight,
+              0,
+              errors::InvalidArgument(
+                  "Each element of multinomial'input must >= 0, but got %f.",
+                  weight));
+          if (weight == static_cast<T>(0)) {
+            zero_num++;
+          }
+        }
+        int valid_samples = num_categories - zero_num;
+        PADDLE_ENFORCE_LE(
+            num_samples,
+            valid_samples,
+            errors::InvalidArgument("When replacement=False, 'num_samples' "
+                                    "must less than or eaqual to the number of "
+                                    "positive item of input"));
+      }
+
+      // Refer to [gumbel softmax algorithm]
+      DenseTensor rand = EmptyLike<T, Context>(dev_ctx, x);
+      T* rand_data = rand.data<T>();
+      funcs::uniform_distribution<T> dist;
+      funcs::exponential_transform<T> trans(1.0);
+      funcs::distribution_and_transform<T>(dev_ctx, &rand, dist, trans);
+
+      funcs::ForRange<Context> for_range(dev_ctx, x.numel());
+      for_range([rand_data, in_data] __device__(size_t idx) {
+        rand_data[idx] = in_data[idx] / rand_data[idx];
+      });
+
+      if (num_samples == 1) {
+        ArgMaxKernel<T, Context>(
+            dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out);
+      } else {
+        std::vector<int64_t> out_dim_vec = vectorize<int64_t>(out->dims());
+        DenseTensor value =
+            Empty<T, Context>(dev_ctx, ScalarArray(out_dim_vec));
+        TopkKernel<T, Context>(
+            dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out);
+      }
+      return;
+    }
 
     funcs::MultinomialFunctor<T>(dev_ctx,
                                  cpu_out_data,
@@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx,
   auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor);
 
   // number of threads in a block is min(num_categories, 512)
-  dim3 block_norm(num_categories < 512 ? num_categories : 512);
+  int block_size = num_categories < 512 ? num_categories : 512;
+  dim3 block_norm(block_size);
   dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
   NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(
       norm_probs_data,
@@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx,
       num_categories);
 
   // Get cumulative probability of each distribution. It's the same function
-  // of
-  // ``cumsum`` op.
+  // of ``cumsum`` op.
   DenseTensor cumulative_probs_tensor;
   cumulative_probs_tensor.Resize({num_distributions, num_categories});
-  auto* cumulative_probs = dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
-
-  dim3 block_cumsum(1);
-  dim3 grid_cumsum(num_distributions);
-  GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
-      norm_probs_data, num_distributions, num_categories, cumulative_probs);
+  auto* cumulative_probs_data =
+      dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
+
+  if (FLAGS_use_curand) {
+    // 'phi::funcs::InclusiveScan' has higher accuracy than
+    // 'thrust::inclusive_scan'
+    funcs::InclusiveScan<T, std::plus<T>>(
+        /*in*/ norm_probs_data,
+        /*out*/ cumulative_probs_data,
+        /*outer_dim*/ static_cast<size_t>(num_distributions),
+        /*mid_dim*/ static_cast<size_t>(num_categories),
+        /*inner_dim*/ static_cast<size_t>(1),
+        /*init*/ static_cast<T>(0),
+        std::plus<T>(),
+        /*reverse=*/false,
+        dev_ctx);
+  } else {
+    dim3 block_cumsum(1);
+    dim3 grid_cumsum(num_distributions);
+    GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
+        norm_probs_data,
+        num_distributions,
+        num_categories,
+        cumulative_probs_data);
+  }
 
   // Generate random number for each sample.
   std::random_device rd;
@@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx,
         RandomGeneratorCudaFunctor<T>(seed));
 
   // Sample the multinomial distributions.
-  dim3 block_sample(128);
-  dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions);
-  sampleMultinomialWithReplacement<
-      T><<<grid_sample, block_sample, 0, dev_ctx.stream()>>>(rng_data,
-                                                             num_samples,
-                                                             out_data,
-                                                             num_distributions,
-                                                             num_categories,
-                                                             cumulative_probs,
-                                                             norm_probs_data);
+  dim3 block(128);
+  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
+  const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
+  int grid_y = std::min<int64_t>(num_distributions, prop.maxGridSize[1]);
+  dim3 grid((num_samples - 1) / block.x + 1, grid_y);
+
+  auto gen_cuda = dev_ctx.GetGenerator();
+  size_t curand4_loop_times =
+      (num_distributions + 4 * grid_y - 1) / (4 * grid_y);
+  // 'increment' shoulde be multiple of 4
+  uint64_t increment = curand4_loop_times * 4;
+  auto seed_offset = gen_cuda->IncrementOffset(increment);
+
+  sampleMultinomialWithReplacement<T><<<grid, block, 0, dev_ctx.stream()>>>(
+      rng_data,
+      num_samples,
+      out_data,
+      num_distributions,
+      num_categories,
+      cumulative_probs_data,
+      norm_probs_data,
+      seed_offset.first,
+      seed_offset.second,
+      FLAGS_use_curand);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu
new file mode 100644
index 0000000000000..21576ab608d26
--- /dev/null
+++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu
@@ -0,0 +1,68 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/multiplex_grad_kernel.h"
+
+#include "paddle/phi/api/lib/utils/tensor_utils.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexGradKernel(const Context& ctx,
+                         const DenseTensor& ids,
+                         const DenseTensor& out_grad,
+                         std::vector<DenseTensor*> ins_grad) {
+  size_t idx = -1UL;
+  for (size_t i = 0; i < ins_grad.size(); i++) {
+    if (ins_grad[i]) {
+      ctx.template Alloc<T>(ins_grad[i]);
+      auto t = phi::EigenVector<T>::Flatten(*ins_grad[i]);
+      t.device(*ctx.eigen_device()) = t.constant(static_cast<T>(0));
+      idx = i;
+    }
+  }
+  if (idx == -1UL) return;
+
+  auto rows = ins_grad[idx]->dims()[0];
+  auto cols = ins_grad[idx]->numel() / rows;
+  DenseTensor index_t_cpu;
+  paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
+  auto* index = index_t_cpu.data<int32_t>();
+  auto stream = ctx.stream();
+  for (auto i = 0; i < rows; i++) {
+    size_t k = static_cast<size_t>(index[i]);
+    if (ins_grad[k]) {
+      paddle::memory::Copy(ctx.GetPlace(),
+                           ins_grad[k]->data<T>() + i * cols,
+                           ctx.GetPlace(),
+                           out_grad.data<T>() + i * cols,
+                           cols * sizeof(T),
+                           stream);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(multiplex_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplexGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu
new file mode 100644
index 0000000000000..743448a468666
--- /dev/null
+++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/multiplex_kernel.h"
+
+#include "paddle/phi/api/lib/utils/tensor_utils.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexKernel(const Context& ctx,
+                     const std::vector<const DenseTensor*>& ins,
+                     const DenseTensor& ids,
+                     DenseTensor* out) {
+  ctx.template Alloc<T>(out);
+  for (size_t i = 0; i < ins.size(); ++i) {
+    PADDLE_ENFORCE_GT(
+        ins[i]->numel(),
+        0,
+        errors::OutOfRange(
+            "indexing will be out of bounds with size 0 for the %d-th input.",
+            i));
+  }
+
+  auto rows = ins[0]->dims()[0];
+  auto cols = ins[0]->numel() / rows;
+  DenseTensor index_t_cpu;
+  paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
+  auto* index = index_t_cpu.data<int32_t>();
+  auto stream = ctx.stream();
+  for (auto i = 0; i < rows; i++) {
+    int32_t k = index[i];
+    PADDLE_ENFORCE_GE(
+        k, 0, errors::PreconditionNotMet("index must be nonnegative."));
+    PADDLE_ENFORCE_LT(static_cast<size_t>(k),
+                      ins.size(),
+                      errors::PreconditionNotMet(
+                          "index exceeds the number of candidate tensors."));
+    paddle::memory::Copy(ctx.GetPlace(),
+                         out->data<T>() + i * cols,
+                         ctx.GetPlace(),
+                         ins[k]->data<T>() + i * cols,
+                         cols * sizeof(T),
+                         stream);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(multiplex,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MultiplexKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu
new file mode 100644
index 0000000000000..32c7fa1e85d15
--- /dev/null
+++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu
@@ -0,0 +1,86 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/one_hot_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename InT, typename OutT>
+__global__ void FillOutputKernel(const InT* p_in_data,
+                                 OutT* p_out_data,
+                                 const int64_t numel,
+                                 const int depth) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) {
+    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct OneHotV2OpCUDAFunctor {
+  const DenseTensor* in_;
+  DenseTensor* out_;
+  const DeviceContext& ctx_;
+  int depth_;
+
+  OneHotV2OpCUDAFunctor(const DenseTensor* in,
+                        DenseTensor* out,
+                        int depth,
+                        const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = ctx_.template Alloc<OutT>(out_);
+    auto stream = ctx_.stream();
+    funcs::set_constant(ctx_, out_, 0.0);
+
+    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                           PADDLE_CUDA_NUM_THREADS,
+                       PADDLE_CUDA_NUM_THREADS,
+                       0,
+                       stream>>>(p_in_data, p_out_data, numel, depth_);
+  }
+};
+
+template <typename T, typename Context>
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out) {
+  auto out_dims = out->dims();
+  if (out_dims[out_dims.size() - 1] == -1) {
+    out_dims[out_dims.size() - 1] = depth;
+    out->Resize(out_dims);
+  }
+
+  phi::VisitDataType(
+      dtype, OneHotV2OpCUDAFunctor<Context, T>(&x, out, depth, dev_ctx));
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    one_hot_raw, GPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {}
diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
new file mode 100644
index 0000000000000..5ca8f3d73dade
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
@@ -0,0 +1,507 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad3d_grad_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T>
+__global__ void Pad3DGradConstNCDHW(const int in_size,
+                                    T* d_in_data,
+                                    const int num,
+                                    const int channels,
+                                    const int in_depth,
+                                    const int in_height,
+                                    const int in_width,
+                                    const int out_depth,
+                                    const int out_height,
+                                    const int out_width,
+                                    const int pad_front,
+                                    const int pad_top,
+                                    const int pad_left,
+                                    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int in_w = in_index % in_width;
+
+    int nc = in_index / in_width;
+    const int in_h = nc % in_height;
+
+    nc /= in_height;
+    const int in_d = nc % in_depth;
+
+    nc /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+    d_in_data[in_index] =
+        d_out_data[nc * out_depth * out_height * out_width +
+                   out_d * out_height * out_width + out_h * out_width + out_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradConstNDHWC(const int in_size,
+                                    T* d_in_data,
+                                    const int num,
+                                    const int channels,
+                                    const int in_depth,
+                                    const int in_height,
+                                    const int in_width,
+                                    const int out_depth,
+                                    const int out_height,
+                                    const int out_width,
+                                    const int pad_front,
+                                    const int pad_top,
+                                    const int pad_left,
+                                    const T* d_out_data) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
+    const int c = in_index % channels;
+    int n = in_index / channels;
+
+    const int in_w = n % in_width;
+    n /= in_width;
+
+    const int in_h = n % in_height;
+    n /= in_height;
+
+    const int in_d = n % in_depth;
+    n /= in_depth;
+
+    const int out_d = in_d + pad_front;
+    const int out_h = in_h + pad_top;
+    const int out_w = in_w + pad_left;
+
+    d_in_data[in_index] =
+        d_out_data[n * out_depth * out_height * out_width * channels +
+                   out_d * out_height * out_width * channels +
+                   out_h * out_width * channels + out_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNCDHW(const int out_size,
+                                      T* d_in_data,
+                                      const int num,
+                                      const int channels,
+                                      const int in_depth,
+                                      const int in_height,
+                                      const int in_width,
+                                      const int out_depth,
+                                      const int out_height,
+                                      const int out_width,
+                                      const int pad_front,
+                                      const int pad_top,
+                                      const int pad_left,
+                                      const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReflectNDHWC(const int out_size,
+                                      T* d_in_data,
+                                      const int num,
+                                      const int channels,
+                                      const int in_depth,
+                                      const int in_height,
+                                      const int in_width,
+                                      const int out_depth,
+                                      const int out_height,
+                                      const int out_width,
+                                      const int pad_front,
+                                      const int pad_top,
+                                      const int pad_left,
+                                      const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_h = max(in_h, -in_h);
+    in_w = max(in_w, -in_w);
+
+    in_d = min(in_d, in_depth * 2 - in_d - 2);
+    in_h = min(in_h, in_height * 2 - in_h - 2);
+    in_w = min(in_w, in_width * 2 - in_w - 2);
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNCDHW(const int out_size,
+                                        T* d_in_data,
+                                        const int num,
+                                        const int channels,
+                                        const int in_depth,
+                                        const int in_height,
+                                        const int in_width,
+                                        const int out_depth,
+                                        const int out_height,
+                                        const int out_width,
+                                        const int pad_front,
+                                        const int pad_top,
+                                        const int pad_left,
+                                        const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradReplicateNDHWC(const int out_size,
+                                        T* d_in_data,
+                                        const int num,
+                                        const int channels,
+                                        const int in_depth,
+                                        const int in_height,
+                                        const int in_width,
+                                        const int out_depth,
+                                        const int out_height,
+                                        const int out_width,
+                                        const int pad_front,
+                                        const int pad_top,
+                                        const int pad_left,
+                                        const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    const int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    const int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    const int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNCDHW(const int out_size,
+                                       T* d_in_data,
+                                       const int num,
+                                       const int channels,
+                                       const int in_depth,
+                                       const int in_height,
+                                       const int in_width,
+                                       const int out_depth,
+                                       const int out_height,
+                                       const int out_width,
+                                       const int pad_front,
+                                       const int pad_top,
+                                       const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    int nc = out_index / out_width;
+    const int out_w = out_index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[nc * in_depth * in_height * in_width +
+                   in_d * in_height * in_width + in_h * in_width + in_w],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T>
+__global__ void Pad3DGradCircularNDHWC(const int out_size,
+                                       T* d_in_data,
+                                       const int num,
+                                       const int channels,
+                                       const int in_depth,
+                                       const int in_height,
+                                       const int in_width,
+                                       const int out_depth,
+                                       const int out_height,
+                                       const int out_width,
+                                       const int pad_front,
+                                       const int pad_top,
+                                       const int pad_left,
+                                       const T* d_out_data) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
+    const int c = out_index % channels;
+    int n = out_index / channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    paddle::platform::CudaAtomicAdd(
+        &d_in_data[n * in_depth * in_height * in_width * channels +
+                   in_d * in_height * in_width * channels +
+                   in_h * in_width * channels + in_w * channels + c],
+        d_out_data[out_index]);
+  }
+}
+
+template <typename T, typename Context>
+void Pad3dGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& out_grad,
+                     const ScalarArray& paddings,
+                     const std::string& mode,
+                     float pad_value,
+                     const std::string& data_format,
+                     DenseTensor* x_grad) {
+  std::vector<int64_t> pads = paddings.GetData();
+  auto* d_out = &out_grad;
+  auto* d_in = x_grad;
+  auto d_in_dims = d_in->dims();
+  auto d_out_dims = d_out->dims();
+  const T* d_out_data = d_out->data<T>();
+  T* d_in_data = dev_ctx.template Alloc<T>(d_in);
+
+  phi::funcs::SetConstant<Context, T>()(dev_ctx, d_in, static_cast<T>(0));
+
+  const int pad_left = pads[0];
+  const int pad_top = pads[2];
+  const int pad_front = pads[4];
+
+  const int num = d_in_dims[0];
+
+  auto stream = dev_ctx.stream();
+  int block = PADDLE_CUDA_NUM_THREADS;
+  const int out_size = d_out->numel();
+  const int in_size = d_in->numel();
+  int grid = (out_size + block - 1) / block;
+
+  if (data_format == "NCDHW") {
+    const int channels = d_in_dims[1];
+    const int in_depth = d_in_dims[2];
+    const int in_height = d_in_dims[3];
+    const int in_width = d_in_dims[4];
+    const int out_depth = d_out_dims[2];
+    const int out_height = d_out_dims[3];
+    const int out_width = d_out_dims[4];
+
+    if (mode == "reflect") {
+      Pad3DGradReflectNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                           d_in_data,
+                                                           num,
+                                                           channels,
+                                                           in_depth,
+                                                           in_height,
+                                                           in_width,
+                                                           out_depth,
+                                                           out_height,
+                                                           out_width,
+                                                           pad_front,
+                                                           pad_top,
+                                                           pad_left,
+                                                           d_out_data);
+    } else if (mode == "replicate") {
+      Pad3DGradReplicateNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                             d_in_data,
+                                                             num,
+                                                             channels,
+                                                             in_depth,
+                                                             in_height,
+                                                             in_width,
+                                                             out_depth,
+                                                             out_height,
+                                                             out_width,
+                                                             pad_front,
+                                                             pad_top,
+                                                             pad_left,
+                                                             d_out_data);
+    } else if (mode == "circular") {
+      Pad3DGradCircularNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                            d_in_data,
+                                                            num,
+                                                            channels,
+                                                            in_depth,
+                                                            in_height,
+                                                            in_width,
+                                                            out_depth,
+                                                            out_height,
+                                                            out_width,
+                                                            pad_front,
+                                                            pad_top,
+                                                            pad_left,
+                                                            d_out_data);
+    } else {
+      grid = (in_size + block - 1) / block;
+      Pad3DGradConstNCDHW<T><<<grid, block, 0, stream>>>(in_size,
+                                                         d_in_data,
+                                                         num,
+                                                         channels,
+                                                         in_depth,
+                                                         in_height,
+                                                         in_width,
+                                                         out_depth,
+                                                         out_height,
+                                                         out_width,
+                                                         pad_front,
+                                                         pad_top,
+                                                         pad_left,
+                                                         d_out_data);
+    }
+  } else {
+    const int channels = d_in_dims[4];
+    const int in_depth = d_in_dims[1];
+    const int in_height = d_in_dims[2];
+    const int in_width = d_in_dims[3];
+    const int out_depth = d_out_dims[1];
+    const int out_height = d_out_dims[2];
+    const int out_width = d_out_dims[3];
+    if (mode == "reflect") {
+      Pad3DGradReflectNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                           d_in_data,
+                                                           num,
+                                                           channels,
+                                                           in_depth,
+                                                           in_height,
+                                                           in_width,
+                                                           out_depth,
+                                                           out_height,
+                                                           out_width,
+                                                           pad_front,
+                                                           pad_top,
+                                                           pad_left,
+                                                           d_out_data);
+    } else if (mode == "replicate") {
+      Pad3DGradReplicateNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                             d_in_data,
+                                                             num,
+                                                             channels,
+                                                             in_depth,
+                                                             in_height,
+                                                             in_width,
+                                                             out_depth,
+                                                             out_height,
+                                                             out_width,
+                                                             pad_front,
+                                                             pad_top,
+                                                             pad_left,
+                                                             d_out_data);
+    } else if (mode == "circular") {
+      Pad3DGradCircularNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                            d_in_data,
+                                                            num,
+                                                            channels,
+                                                            in_depth,
+                                                            in_height,
+                                                            in_width,
+                                                            out_depth,
+                                                            out_height,
+                                                            out_width,
+                                                            pad_front,
+                                                            pad_top,
+                                                            pad_left,
+                                                            d_out_data);
+    } else {
+      grid = (in_size + block - 1) / block;
+      Pad3DGradConstNDHWC<T><<<grid, block, 0, stream>>>(in_size,
+                                                         d_in_data,
+                                                         num,
+                                                         channels,
+                                                         in_depth,
+                                                         in_height,
+                                                         in_width,
+                                                         out_depth,
+                                                         out_height,
+                                                         out_width,
+                                                         pad_front,
+                                                         pad_top,
+                                                         pad_left,
+                                                         d_out_data);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu
new file mode 100644
index 0000000000000..8f7cf716e79cf
--- /dev/null
+++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu
@@ -0,0 +1,591 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/pad3d_kernel.h"
+
+#include <algorithm>
+
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T>
+__global__ void Pad3DConstNCDHW(const int nthreads,
+                                const T* in_data,
+                                const int num,
+                                const int channels,
+                                const int in_depth,
+                                const int in_height,
+                                const int in_width,
+                                const int out_depth,
+                                const int out_height,
+                                const int out_width,
+                                const int pad_front,
+                                const int pad_top,
+                                const int pad_left,
+                                T value,
+                                T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[nc * in_depth * in_height * in_width +
+                      in_d * in_height * in_width + in_h * in_width + in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DConstNDHWC(const int nthreads,
+                                const T* in_data,
+                                const int num,
+                                const int channels,
+                                const int in_depth,
+                                const int in_height,
+                                const int in_width,
+                                const int out_depth,
+                                const int out_height,
+                                const int out_width,
+                                const int pad_front,
+                                const int pad_top,
+                                const int pad_left,
+                                T value,
+                                T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    const int in_d = out_d - pad_front;
+    const int in_h = out_h - pad_top;
+    const int in_w = out_w - pad_left;
+
+    out_data[index] =
+        (in_d < 0 || in_h < 0 || in_w < 0 || in_d >= in_depth ||
+         in_h >= in_height || in_w >= in_width)
+            ? value
+            : in_data[n * in_depth * in_height * in_width * channels +
+                      in_d * in_height * in_width * channels +
+                      in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNCDHW(const int nthreads,
+                                  const T* in_data,
+                                  const int num,
+                                  const int channels,
+                                  const int in_depth,
+                                  const int in_height,
+                                  const int in_width,
+                                  const int out_depth,
+                                  const int out_height,
+                                  const int out_width,
+                                  const int pad_front,
+                                  const int pad_top,
+                                  const int pad_left,
+                                  T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);                     // reflect by 0
+    in_d = min(in_d, 2 * in_depth - in_d - 2);   // reflect by in_depth
+    in_h = max(in_h, -in_h);                     // reflect by 0
+    in_h = min(in_h, 2 * in_height - in_h - 2);  // reflect by in_height
+    in_w = max(in_w, -in_w);                     // reflect by 0
+    in_w = min(in_w, 2 * in_width - in_w - 2);   // reflect by in_width
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReflectNDHWC(const int nthreads,
+                                  const T* in_data,
+                                  const int num,
+                                  const int channels,
+                                  const int in_depth,
+                                  const int in_height,
+                                  const int in_width,
+                                  const int out_depth,
+                                  const int out_height,
+                                  const int out_width,
+                                  const int pad_front,
+                                  const int pad_top,
+                                  const int pad_left,
+                                  T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+    int in_d = out_d - pad_front;
+    int in_h = out_h - pad_top;
+    int in_w = out_w - pad_left;
+
+    in_d = max(in_d, -in_d);
+    in_d = min(in_d, 2 * in_depth - in_d - 2);
+    in_h = max(in_h, -in_h);
+    in_h = min(in_h, 2 * in_height - in_h - 2);
+    in_w = max(in_w, -in_w);
+    in_w = min(in_w, 2 * in_width - in_w - 2);
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNCDHW(const int nthreads,
+                                    const T* in_data,
+                                    const int num,
+                                    const int channels,
+                                    const int in_depth,
+                                    const int in_height,
+                                    const int in_width,
+                                    const int out_depth,
+                                    const int out_height,
+                                    const int out_width,
+                                    const int pad_front,
+                                    const int pad_top,
+                                    const int pad_left,
+                                    T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DReplicateNDHWC(const int nthreads,
+                                    const T* in_data,
+                                    const int num,
+                                    const int channels,
+                                    const int in_depth,
+                                    const int in_height,
+                                    const int in_width,
+                                    const int out_depth,
+                                    const int out_height,
+                                    const int out_width,
+                                    const int pad_front,
+                                    const int pad_top,
+                                    const int pad_left,
+                                    T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = min(in_depth - 1, max(out_d - pad_front, 0));
+    int in_h = min(in_height - 1, max(out_h - pad_top, 0));
+    int in_w = min(in_width - 1, max(out_w - pad_left, 0));
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNCDHW(const int nthreads,
+                                   const T* in_data,
+                                   const int num,
+                                   const int channels,
+                                   const int in_depth,
+                                   const int in_height,
+                                   const int in_width,
+                                   const int out_depth,
+                                   const int out_height,
+                                   const int out_width,
+                                   const int pad_front,
+                                   const int pad_top,
+                                   const int pad_left,
+                                   T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int nc = index / out_width;
+
+    const int out_w = index % out_width;
+    const int out_h = nc % out_height;
+    nc /= out_height;
+    const int out_d = nc % out_depth;
+    nc /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] =
+        in_data[(nc * in_depth * in_height + in_d * in_height + in_h) *
+                    in_width +
+                in_w];
+  }
+}
+
+template <typename T>
+__global__ void Pad3DCircularNDHWC(const int nthreads,
+                                   const T* in_data,
+                                   const int num,
+                                   const int channels,
+                                   const int in_depth,
+                                   const int in_height,
+                                   const int in_width,
+                                   const int out_depth,
+                                   const int out_height,
+                                   const int out_width,
+                                   const int pad_front,
+                                   const int pad_top,
+                                   const int pad_left,
+                                   T* out_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int n = index / channels;
+    const int c = index % channels;
+    const int out_w = n % out_width;
+    n /= out_width;
+    const int out_h = n % out_height;
+    n /= out_height;
+    const int out_d = n % out_depth;
+    n /= out_depth;
+
+    int in_d = ((out_d - pad_front) % in_depth + in_depth) % in_depth;
+    int in_h = ((out_h - pad_top) % in_height + in_height) % in_height;
+    int in_w = ((out_w - pad_left) % in_width + in_width) % in_width;
+
+    out_data[index] = in_data[n * in_depth * in_height * in_width * channels +
+                              in_d * in_height * in_width * channels +
+                              in_h * in_width * channels + in_w * channels + c];
+  }
+}
+
+template <typename T, typename Context>
+void Pad3dKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const ScalarArray& paddings,
+                 const std::string& mode,
+                 float pad_value,
+                 const std::string& data_format,
+                 DenseTensor* out) {
+  std::vector<int64_t> pads = paddings.GetData();
+
+  auto in_dims = x.dims();
+  const T* in_data = x.data<T>();
+  auto out_dims = out->dims();
+  T value = static_cast<T>(pad_value);
+
+  if (data_format == "NCDHW") {
+    out_dims[0] = in_dims[0];
+    out_dims[1] = in_dims[1];
+    out_dims[2] = in_dims[2] + pads[4] + pads[5];
+    out_dims[3] = in_dims[3] + pads[2] + pads[3];
+    out_dims[4] = in_dims[4] + pads[0] + pads[1];
+  } else {
+    out_dims[0] = in_dims[0];
+    out_dims[1] = in_dims[1] + pads[4] + pads[5];
+    out_dims[2] = in_dims[2] + pads[2] + pads[3];
+    out_dims[3] = in_dims[3] + pads[0] + pads[1];
+    out_dims[4] = in_dims[4];
+  }
+  out->Resize(out_dims);
+  T* out_data = dev_ctx.template Alloc<T>(out);
+
+  int channels = in_dims[1];
+  int in_depth = in_dims[2];
+  int in_height = in_dims[3];
+  int in_width = in_dims[4];
+  int out_depth = out_dims[2];
+  int out_height = out_dims[3];
+  int out_width = out_dims[4];
+  if (data_format == "NDHWC") {
+    channels = in_dims[4];
+    in_depth = in_dims[1];
+    in_height = in_dims[2];
+    in_width = in_dims[3];
+    out_depth = out_dims[1];
+    out_height = out_dims[2];
+    out_width = out_dims[3];
+  }
+
+  if (mode == "reflect") {
+    PADDLE_ENFORCE_GT(
+        in_depth,
+        pads[4],
+        errors::InvalidArgument("The depth of Input(X)'s dimension should be "
+                                "greater than pad_front"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_front(%d).",
+                                in_depth,
+                                pads[4]));
+    PADDLE_ENFORCE_GT(
+        in_depth,
+        pads[5],
+        errors::InvalidArgument("The depth of Input(X)'s dimension should be "
+                                "greater than pad_back"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_back(%d).",
+                                in_depth,
+                                pads[5]));
+
+    PADDLE_ENFORCE_GT(
+        in_height,
+        pads[2],
+        errors::InvalidArgument("The height of Input(X)'s dimension should be "
+                                "greater than pad_top"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_top(%d).",
+                                in_height,
+                                pads[2]));
+    PADDLE_ENFORCE_GT(
+        in_height,
+        pads[3],
+        errors::InvalidArgument("The height of Input(X)'s dimension should be "
+                                "greater than pad_bottom"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_bottom(%d).",
+                                in_height,
+                                pads[3]));
+
+    PADDLE_ENFORCE_GT(
+        in_width,
+        pads[0],
+        errors::InvalidArgument("The width of Input(X)'s dimension should be "
+                                "greater than pad_left"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_left(%d).",
+                                in_width,
+                                pads[0]));
+    PADDLE_ENFORCE_GT(
+        in_width,
+        pads[1],
+        errors::InvalidArgument("The width of Input(X)'s dimension should be "
+                                "greater than pad_right"
+                                " in reflect mode"
+                                ", but received depth(%d) and pad_right(%d).",
+                                in_width,
+                                pads[1]));
+  } else if (mode == "circular" || mode == "replicate") {
+    PADDLE_ENFORCE_NE(in_depth * in_height * in_width,
+                      0,
+                      errors::InvalidArgument(
+                          "The input tensor size can not be 0 for circular "
+                          "or replicate padding mode."));
+  }
+
+  const int pad_left = pads[0];
+  const int pad_top = pads[2];
+  const int pad_front = pads[4];
+  const int num = in_dims[0];
+
+  auto stream = dev_ctx.stream();
+  int block = PADDLE_CUDA_NUM_THREADS;
+  const int out_size = out->numel();
+  int grid = (out_size + block - 1) / block;
+
+  if (data_format == "NCDHW") {
+    if (mode == "reflect") {
+      Pad3DReflectNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                       in_data,
+                                                       num,
+                                                       channels,
+                                                       in_depth,
+                                                       in_height,
+                                                       in_width,
+                                                       out_depth,
+                                                       out_height,
+                                                       out_width,
+                                                       pad_front,
+                                                       pad_top,
+                                                       pad_left,
+                                                       out_data);
+    } else if (mode == "replicate") {
+      Pad3DReplicateNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                         in_data,
+                                                         num,
+                                                         channels,
+                                                         in_depth,
+                                                         in_height,
+                                                         in_width,
+                                                         out_depth,
+                                                         out_height,
+                                                         out_width,
+                                                         pad_front,
+                                                         pad_top,
+                                                         pad_left,
+                                                         out_data);
+    } else if (mode == "circular") {
+      Pad3DCircularNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                        in_data,
+                                                        num,
+                                                        channels,
+                                                        in_depth,
+                                                        in_height,
+                                                        in_width,
+                                                        out_depth,
+                                                        out_height,
+                                                        out_width,
+                                                        pad_front,
+                                                        pad_top,
+                                                        pad_left,
+                                                        out_data);
+    } else {
+      Pad3DConstNCDHW<T><<<grid, block, 0, stream>>>(out_size,
+                                                     in_data,
+                                                     num,
+                                                     channels,
+                                                     in_depth,
+                                                     in_height,
+                                                     in_width,
+                                                     out_depth,
+                                                     out_height,
+                                                     out_width,
+                                                     pad_front,
+                                                     pad_top,
+                                                     pad_left,
+                                                     value,
+                                                     out_data);
+    }
+  } else {
+    if (mode == "reflect") {
+      Pad3DReflectNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                       in_data,
+                                                       num,
+                                                       channels,
+                                                       in_depth,
+                                                       in_height,
+                                                       in_width,
+                                                       out_depth,
+                                                       out_height,
+                                                       out_width,
+                                                       pad_front,
+                                                       pad_top,
+                                                       pad_left,
+                                                       out_data);
+    } else if (mode == "replicate") {
+      Pad3DReplicateNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                         in_data,
+                                                         num,
+                                                         channels,
+                                                         in_depth,
+                                                         in_height,
+                                                         in_width,
+                                                         out_depth,
+                                                         out_height,
+                                                         out_width,
+                                                         pad_front,
+                                                         pad_top,
+                                                         pad_left,
+                                                         out_data);
+    } else if (mode == "circular") {
+      Pad3DCircularNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                        in_data,
+                                                        num,
+                                                        channels,
+                                                        in_depth,
+                                                        in_height,
+                                                        in_width,
+                                                        out_depth,
+                                                        out_height,
+                                                        out_width,
+                                                        pad_front,
+                                                        pad_top,
+                                                        pad_left,
+                                                        out_data);
+    } else {
+      Pad3DConstNDHWC<T><<<grid, block, 0, stream>>>(out_size,
+                                                     in_data,
+                                                     num,
+                                                     channels,
+                                                     in_depth,
+                                                     in_height,
+                                                     in_width,
+                                                     out_depth,
+                                                     out_height,
+                                                     out_width,
+                                                     pad_front,
+                                                     pad_top,
+                                                     pad_left,
+                                                     value,
+                                                     out_data);
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(pad3d,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::Pad3dKernel,
+                   phi::dtype::float16,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h
new file mode 100644
index 0000000000000..76ee9439a2050
--- /dev/null
+++ b/paddle/phi/kernels/gpu/prelu_funcs.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+#define CUDA_NUM_THREADS 1024
+
+inline static int PADDLE_GET_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+template <typename T>
+__global__ void PReluChannelFirstWiseKernel(const T *input,
+                                            const T *alpha,
+                                            T *output,
+                                            size_t channel_num,
+                                            size_t plane_size,
+                                            size_t numel) {
+  CUDA_KERNEL_LOOP(index, numel) {
+    size_t temp = index / plane_size;
+    size_t channel_index = temp % channel_num;
+    T scale = alpha[channel_index];
+    T x = input[index];
+    T zero = static_cast<T>(0);
+    output[index] = (x > zero) ? x : scale * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluChannelLastWiseKernel(const T *input,
+                                           const T *alpha,
+                                           T *output,
+                                           size_t channel_num,
+                                           size_t numel) {
+  CUDA_KERNEL_LOOP(index, numel) {
+    size_t channel_index = index % channel_num;
+    T scale = alpha[channel_index];
+    T x = input[index];
+    T zero = static_cast<T>(0);
+    output[index] = (x > zero) ? x : scale * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluElementWiseKernel(const T *input,
+                                       const T *alpha,
+                                       T *output,
+                                       size_t spatial_size,
+                                       size_t numel) {
+  CUDA_KERNEL_LOOP(index, numel) {
+    size_t element_index = index % spatial_size;
+    T scale = alpha[element_index];
+    T x = input[index];
+    T zero = static_cast<T>(0);
+    output[index] = (x > zero) ? x : scale * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluScalarKernel(const T *input,
+                                  const T *alpha,
+                                  T *output,
+                                  size_t numel) {
+  T scale = alpha[0];
+  CUDA_KERNEL_LOOP(index, numel) {
+    T x = input[index];
+    T zero = static_cast<T>(0);
+    output[index] = (x > zero) ? x : scale * x;
+  }
+}
+
+template <typename T>
+class PreluChannelWiseDirectCUDAFunctor {
+ public:
+  void operator()(gpuStream_t stream,
+                  const T *input,
+                  const T *alpha,
+                  T *output,
+                  size_t batch_size,
+                  size_t channel,
+                  bool channel_last,
+                  size_t numel);
+};
+
+template <typename T>
+class PreluElementWiseDirectCUDAFunctor {
+ public:
+  void operator()(gpuStream_t stream,
+                  const T *input,
+                  const T *alpha,
+                  T *output,
+                  size_t batch_size,
+                  size_t numel);
+};
+
+template <typename T>
+class PreluScalarDirectCUDAFunctor {
+ public:
+  void operator()(gpuStream_t stream,
+                  const T *input,
+                  const T *alpha,
+                  T *output,
+                  size_t numel);
+};
+
+template <typename T>
+void PreluChannelWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
+                                                      const T *input,
+                                                      const T *alpha,
+                                                      T *output,
+                                                      size_t batch_size,
+                                                      size_t channel,
+                                                      bool channel_last,
+                                                      size_t numel) {
+  if (channel_last) {
+    PReluChannelLastWiseKernel<<<PADDLE_GET_BLOCKS(numel),
+                                 CUDA_NUM_THREADS,
+                                 0,
+                                 stream>>>(
+        input, alpha, output, channel, numel);
+  } else {
+    PReluChannelFirstWiseKernel<<<PADDLE_GET_BLOCKS(numel),
+                                  CUDA_NUM_THREADS,
+                                  0,
+                                  stream>>>(
+        input, alpha, output, channel, numel / batch_size / channel, numel);
+  }
+}
+
+template <typename T>
+void PreluElementWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
+                                                      const T *input,
+                                                      const T *alpha,
+                                                      T *output,
+                                                      size_t batch_size,
+                                                      size_t numel) {
+  PReluElementWiseKernel<<<PADDLE_GET_BLOCKS(numel),
+                           CUDA_NUM_THREADS,
+                           0,
+                           stream>>>(
+      input, alpha, output, numel / batch_size, numel);
+}
+
+template <typename T>
+void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
+                                                 const T *input,
+                                                 const T *alpha,
+                                                 T *output,
+                                                 size_t numel) {
+  PReluScalarKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, numel);
+}
+
+template class PreluChannelWiseDirectCUDAFunctor<float>;
+template class PreluChannelWiseDirectCUDAFunctor<phi::dtype::float16>;
+template class PreluChannelWiseDirectCUDAFunctor<double>;
+
+template class PreluElementWiseDirectCUDAFunctor<float>;
+template class PreluElementWiseDirectCUDAFunctor<phi::dtype::float16>;
+template class PreluElementWiseDirectCUDAFunctor<double>;
+
+template class PreluScalarDirectCUDAFunctor<float>;
+template class PreluScalarDirectCUDAFunctor<phi::dtype::float16>;
+template class PreluScalarDirectCUDAFunctor<double>;
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu
new file mode 100644
index 0000000000000..d8661268e82c3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu
@@ -0,0 +1,183 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/prelu_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/gpu/prelu_funcs.h"
+#include "paddle/phi/kernels/primitive/functor_primitives.h"
+
+namespace phi {
+
+enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar };
+
+template <typename T>
+__global__ void PReluOpGradKernel(const T* x_ptr,
+                                  const T* alpha_ptr,
+                                  const T* out_grad_ptr,
+                                  T* x_grad_ptr,
+                                  T* alpha_grad_ptr,
+                                  size_t channel_num,
+                                  size_t plane_size,
+                                  size_t spatial_size,
+                                  size_t numel,
+                                  PRELU_MODE mode) {
+  CUDA_KERNEL_LOOP(index, numel) {
+    T scale;
+    if (mode == Element) {
+      size_t element_index = index % spatial_size;
+      scale = alpha_ptr[element_index];
+    } else if (mode == ChannelFirst) {
+      size_t temp = index / plane_size;
+      size_t channel_index = temp % channel_num;
+      scale = alpha_ptr[channel_index];
+    } else if (mode == ChannelLast) {
+      size_t channel_index = index % channel_num;
+      scale = alpha_ptr[channel_index];
+    } else {
+      scale = alpha_ptr[0];
+    }
+    T x = x_ptr[index];
+    T out_grad = out_grad_ptr[index];
+    T zero = static_cast<T>(0);
+    if (x_grad_ptr != nullptr)
+      x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad;
+    if (alpha_grad_ptr != nullptr)
+      alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad;
+  }
+}
+
+template <typename T>
+class PreluOpGradFunctor {
+ public:
+  void operator()(gpuStream_t stream,
+                  const T* x,
+                  const T* alpha,
+                  const T* out_grad,
+                  T* x_grad,
+                  T* alpha_grad,
+                  const DDim& input_dims,
+                  PRELU_MODE mode) {
+    size_t numel = 1;
+    for (size_t i = 0; i < input_dims.size(); ++i) {
+      numel *= input_dims[i];
+    }
+    size_t plane_size = numel / input_dims[0] / input_dims[1];
+    size_t spatial_size = numel / input_dims[0];
+    size_t channel =
+        mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1];
+
+    PReluOpGradKernel<
+        T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
+        x,
+        alpha,
+        out_grad,
+        x_grad,
+        alpha_grad,
+        channel,
+        plane_size,
+        spatial_size,
+        numel,
+        mode);
+  }
+};
+
+template <typename T, typename Context>
+void PReluGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& alpha,
+                     const DenseTensor& out_grad,
+                     const std::string& mode,
+                     const std::string& data_format,
+                     DenseTensor* x_grad,
+                     DenseTensor* alpha_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+
+  const T* x_ptr = x.data<T>();
+  const T* alpha_ptr = alpha.data<T>();
+  const T* out_grad_ptr = out_grad.data<T>();
+  T* x_grad_ptr = x_grad ? dev_ctx.template Alloc<T>(x_grad) : nullptr;
+  T* alpha_grad_ptr =
+      alpha_grad ? dev_ctx.template Alloc<T>(alpha_grad) : nullptr;
+
+  if (!x_grad && !alpha_grad) return;
+
+  int numel = x.numel();
+  auto dim = x.dims();
+  auto x_rank = dim.size();
+  std::vector<int> input_shape = phi::vectorize<int>(dim);
+  auto stream = dev_ctx.stream();
+
+  T* alpha_grad_tmp_ptr;
+  DenseTensor alpha_grad_tmp;
+  if (alpha_grad_ptr == nullptr) {
+    alpha_grad_tmp_ptr = alpha_grad_ptr;
+  } else {
+    DenseTensorMeta alpha_grad_meta(
+        alpha_grad->dtype(), dim, alpha_grad->layout());
+    alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta));
+    alpha_grad_tmp_ptr = alpha_grad_tmp.data<T>();
+  }
+
+  PRELU_MODE m;
+  bool channel_last = false;
+  if (mode == "element") {
+    m = Element;
+  } else if (mode == "channel") {
+    channel_last = data_format == "NHWC";
+    m = channel_last ? ChannelLast : ChannelFirst;
+  } else {
+    m = PRELU_Scalar;
+  }
+  PreluOpGradFunctor<T> prelu_grad;
+  prelu_grad(stream,
+             x_ptr,
+             alpha_ptr,
+             out_grad_ptr,
+             x_grad_ptr,
+             alpha_grad_tmp_ptr,
+             dim,
+             m);
+
+  if (alpha_grad_tmp_ptr == nullptr) return;
+
+  std::vector<int> reduce_dims;
+  for (size_t i = 0; i < dim.size(); i++) {
+    if (mode == "channel" && !channel_last && i == 1) continue;
+    if (mode == "channel" && channel_last && i == dim.size() - 1) continue;
+    if (mode == "element" && i != 0) continue;
+    reduce_dims.push_back(i);
+  }
+
+  phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      static_cast<const phi::GPUContext&>(dev_ctx),
+      alpha_grad_tmp,
+      alpha_grad,
+      kps::IdentityFunctor<T>(),
+      reduce_dims);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(prelu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PReluGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu
new file mode 100644
index 0000000000000..8255a7ba2ed96
--- /dev/null
+++ b/paddle/phi/kernels/gpu/prelu_kernel.cu
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/prelu_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/prelu_funcs.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PReluKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const DenseTensor& alpha,
+                 const std::string& mode,
+                 const std::string& data_format,
+                 DenseTensor* out) {
+  const T* x_ptr = x.data<T>();
+  T* o_ptr = dev_ctx.template Alloc<T>(out);
+
+  const T* alpha_ptr = alpha.data<T>();
+  int numel = x.numel();
+  auto dim = x.dims();
+  auto x_rank = dim.size();
+
+  VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim["
+          << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel;
+
+  if (mode == "channel") {
+    bool channel_last = data_format == "NHWC";
+    size_t channel = channel_last ? dim[x_rank - 1] : dim[1];
+    PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
+    prelu_channel_wise(dev_ctx.stream(),
+                       x_ptr,
+                       alpha_ptr,
+                       o_ptr,
+                       dim[0],
+                       channel,
+                       channel_last,
+                       numel);
+  } else if (mode == "element") {
+    PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
+    prelu_element_wise(
+        dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel);
+  } else {
+    PreluScalarDirectCUDAFunctor<T> prelu_scalar;
+    prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(prelu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PReluKernel,
+                   float,
+                   phi::dtype::float16,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu
index d4d90cac917a2..92948bf47c934 100644
--- a/paddle/phi/kernels/gpu/randperm_kernel.cu
+++ b/paddle/phi/kernels/gpu/randperm_kernel.cu
@@ -14,37 +14,161 @@
 
 #include "paddle/phi/kernels/randperm_kernel.h"
 
+#ifdef __NVCC__
+#include <curand_kernel.h>
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hiprand_kernel.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/randint_kernel.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/memory/memcpy.h"
 
+DECLARE_bool(use_curand);
+
 namespace phi {
 
+template <typename T>
+__global__ void SwapRepeatKernel(
+    int* key, T* data, int n, uint64_t seed, uint64_t offset) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < n) return;
+
+  bool first_repeat = false;
+  if (data[idx] == data[idx + 1]) {
+    if (idx == 0) {
+      first_repeat = true;
+    } else if (data[idx] != data[idx - 1]) {
+      first_repeat = true;
+    }
+  }
+
+  if (!first_repeat) return;
+
+  int repeat_size = 1;
+  for (int i = idx; i < n; ++i) {
+    if (data[i] == data[i + 1]) {
+      ++repeat_size;
+    } else {
+      break;
+    }
+  }
+
+#ifdef __NVCC__
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx, offset, &state);
+  for (int i = repeat_size - 1; i > 0; i--) {
+    uint32_t r = curand(&state) % (i + 1);
+#elif __HIPCC__
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx, offset, &state);
+  for (int i = repeat_size - 1; i > 0; i--) {
+    uint32_t r = hiprand(&state) % (i + 1);
+#endif
+    if (r != i) {
+      T tmp = data[idx + i];
+      data[idx + i] = data[idx + r];
+      data[idx + r] = tmp;
+    }
+  }
+}
+
 template <typename T, typename Context>
 void RandpermRawKernel(
     const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) {
-  DenseTensor tmp;
-  tmp.Resize(phi::make_ddim({n}));
-  T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);
-
-  std::shared_ptr<std::mt19937_64> engine;
-  if (seed) {
-    engine = std::make_shared<std::mt19937_64>();
-    engine->seed(seed);
+  if (FLAGS_use_curand) {
+    DenseTensor key;
+    RandintKernel<int, Context>(dev_ctx,
+                                std::numeric_limits<int>::min(),
+                                std::numeric_limits<int>::max(),
+                                ScalarArray({n}),
+                                phi::DataType::INT32,
+                                &key);
+    DenseTensor key_out = Empty<int, Context>(dev_ctx, ScalarArray({n}));
+
+    DenseTensor range = Empty<T, Context>(dev_ctx, ScalarArray({n}));
+    T* range_data = range.data<T>();
+    funcs::ForRange<Context> for_range(dev_ctx, n);
+    for_range([range_data] __device__(size_t idx) {
+      range_data[idx] = static_cast<T>(idx);
+    });
+
+    out->Resize(phi::make_ddim({n}));
+    T* out_data = dev_ctx.template Alloc<T>(out);
+
+    // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to
+    // improve performance of radix sort.
+    double n_d = static_cast<double>(n);
+    int begin_bit = 0;
+    int end_bit =
+        std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9))));
+
+    size_t temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs<int, T>(nullptr,
+                                            temp_storage_bytes,
+                                            key.data<int>(),
+                                            key_out.data<int>(),
+                                            range.data<T>(),
+                                            out_data,
+                                            n,
+                                            begin_bit,
+                                            end_bit < 32 ? end_bit : 32,
+                                            dev_ctx.stream());
+
+    auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes);
+    cub::DeviceRadixSort::SortPairs<int, T>(d_temp_storage->ptr(),
+                                            temp_storage_bytes,
+                                            key.data<int>(),
+                                            key_out.data<int>(),
+                                            range.data<T>(),
+                                            out_data,
+                                            n,
+                                            begin_bit,
+                                            end_bit < 32 ? end_bit : 32,
+                                            dev_ctx.stream());
+
+    auto gen_cuda = dev_ctx.GetGenerator();
+    auto seed_offset = gen_cuda->IncrementOffset(n);
+    uint64_t seed = seed_offset.first;
+    uint64_t offset = seed_offset.second;
+
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
+    SwapRepeatKernel<T><<<config.block_per_grid.x,
+                          config.thread_per_block.x,
+                          0,
+                          dev_ctx.stream()>>>(
+        key_out.data<int>(), out_data, n, seed, offset);
   } else {
-    engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
-  }
+    DenseTensor tmp;
+    tmp.Resize(phi::make_ddim({n}));
+    T* tmp_data = dev_ctx.template HostAlloc<T>(&tmp);
 
-  for (int i = 0; i < n; ++i) {
-    tmp_data[i] = static_cast<T>(i);
-  }
-  std::shuffle(tmp_data, tmp_data + n, *engine);
+    std::shared_ptr<std::mt19937_64> engine;
+    if (seed) {
+      engine = std::make_shared<std::mt19937_64>();
+      engine->seed(seed);
+    } else {
+      engine = dev_ctx.GetHostGenerator()->GetCPUEngine();
+    }
 
-  T* out_data = dev_ctx.template Alloc<T>(out);
-  auto size = out->numel() * paddle::experimental::SizeOf(out->dtype());
-  paddle::memory::Copy<phi::GPUPlace, phi::Place>(
-      out->place(), out_data, tmp.place(), tmp_data, size, 0);
+    for (int i = 0; i < n; ++i) {
+      tmp_data[i] = static_cast<T>(i);
+    }
+    std::shuffle(tmp_data, tmp_data + n, *engine);
+
+    T* out_data = dev_ctx.template Alloc<T>(out);
+    auto size = out->numel() * paddle::experimental::SizeOf(out->dtype());
+    paddle::memory::Copy<phi::GPUPlace, phi::Place>(
+        out->place(), out_data, tmp.place(), tmp_data, size, 0);
+  }
 }
 
 template <typename T, typename Context>
diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h
index d21c8a3fa46f8..e32101b73728f 100644
--- a/paddle/phi/kernels/gpu/reduce_grad.h
+++ b/paddle/phi/kernels/gpu/reduce_grad.h
@@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx,
       }));
 }
 
+template <typename T,
+          typename Context,
+          template <typename, typename> class TransformOp>
+void ReduceGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out_grad,
+                      const std::vector<int64_t>& dims,
+                      bool keep_dim,
+                      bool reduce_all,
+                      DataType in_dtype,
+                      DataType out_dtype,
+                      DenseTensor* x_grad) {
+  auto* in_x = &x;
+  auto* d_out = &out_grad;
+  auto* d_x = x_grad;
+
+  auto pt_out_dtype = in_dtype;
+
+  // get reduce_dim and reduce_num for reduce_mean_grad
+  int dim_size = in_x->dims().size();
+  std::vector<int> reduce_dims =
+      funcs::details::GetReduceDim(dims, dim_size, reduce_all);
+
+  auto update_dims = vectorize(d_x->dims());
+  int reduce_num = 1;
+  for (auto i : reduce_dims) {
+    reduce_num *= (in_x->dims())[i];
+    update_dims[i] = 1;
+  }
+  // make new tensor
+  DenseTensor new_d_out(d_out->dtype());
+  new_d_out.ShareDataWith(*d_out);
+  new_d_out.Resize(phi::make_ddim(update_dims));
+  if (in_dtype != DataType::UNDEFINED) {
+    dev_ctx.Alloc(d_x, in_dtype);
+  } else {
+    dev_ctx.Alloc(d_x, d_out->dtype());
+  }
+
+  auto pt_d_out = new_d_out;
+  auto pt_d_x = *d_x;
+  if (in_dtype == DataType::UNDEFINED) {
+    pt_out_dtype = d_out->dtype();
+  }
+  using MPType = typename kps::details::MPTypeTrait<T>::Type;
+
+  phi::ReduceGrad<T, TransformOp<T, MPType>>(
+      dev_ctx,
+      &pt_d_out,
+      &pt_d_x,
+      pt_out_dtype,
+      TransformOp<T, MPType>(reduce_num));
+}
+
 }  // namespace phi
 #endif
diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu
new file mode 100644
index 0000000000000..5256048267ea1
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu
@@ -0,0 +1,119 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/gpu/reduce_grad.h"
+#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  ReduceGradKernel<T, Context, kps::IdentityFunctor>(dev_ctx,
+                                                     x,
+                                                     out_grad,
+                                                     dims,
+                                                     keep_dim,
+                                                     reduce_all,
+                                                     in_dtype,
+                                                     out_dtype,
+                                                     x_grad);
+}
+
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<T, Context, kps::DivideFunctor>(dev_ctx,
+                                                   x,
+                                                   out_grad,
+                                                   dims,
+                                                   keep_dim,
+                                                   reduce_all,
+                                                   in_dtype,
+                                                   out_dtype,
+                                                   x_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sum_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceSumGradKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(mean_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMeanGradKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(prod_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceProdGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(max_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMaxGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(min_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReduceMinGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu
new file mode 100644
index 0000000000000..6cbe699e8e058
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reduce_kernel.cu
@@ -0,0 +1,158 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out) {
+  phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void ProdRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::MulFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::MaxFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::MinFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::LogicalAndFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::LogicalOrFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+}  // namespace phi
+
+using float16 = phi::dtype::float16;
+using bfloat16 = phi::dtype::bfloat16;
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(sum_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SumRawKernel,
+                   bool,
+                   float,
+                   double,
+                   float16,
+                   bfloat16,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+
+PD_REGISTER_KERNEL(mean_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MeanRawKernel,
+                   float,
+                   double,
+                   bool,
+                   float16,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(prod_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ProdRawKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
+
+PD_REGISTER_KERNEL(
+    max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(
+    min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {}
+
+PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {}
diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu
index 98c3986c51dd6..ddbc08b06c84b 100644
--- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/gpu/reduce.h"
diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
deleted file mode 100644
index 9f4ddc3cf37a7..0000000000000
--- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/reduce_function.h"
-#include "paddle/phi/kernels/gpu/reduce_grad.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void ReduceSumGradKernel(const Context& dev_ctx,
-                         const DenseTensor& x,
-                         const DenseTensor& out_grad,
-                         const std::vector<int64_t>& dims,
-                         bool keep_dim,
-                         bool reduce_all,
-                         DataType in_dtype,
-                         DataType out_dtype,
-                         DenseTensor* x_grad) {
-  auto* in_x = &x;
-  auto* d_out = &out_grad;
-  auto* d_x = x_grad;
-
-  auto pt_out_dtype = in_dtype;
-
-  // get reduce_dim and reduce_num for reduce_mean_grad
-  int dim_size = in_x->dims().size();
-  std::vector<int> reduce_dims =
-      funcs::details::GetReduceDim(dims, dim_size, reduce_all);
-
-  auto update_dims = vectorize(d_x->dims());
-  int reduce_num = 1;
-  for (auto i : reduce_dims) {
-    reduce_num *= (in_x->dims())[i];
-    update_dims[i] = 1;
-  }
-  // make new tensor
-  DenseTensor new_d_out(d_out->dtype());
-  new_d_out.ShareDataWith(*d_out);
-  new_d_out.Resize(phi::make_ddim(update_dims));
-  if (in_dtype != DataType::UNDEFINED) {
-    dev_ctx.Alloc(d_x, in_dtype);
-  } else {
-    dev_ctx.Alloc(d_x, d_out->dtype());
-  }
-
-  auto pt_d_out = new_d_out;
-  auto pt_d_x = *d_x;
-  if (in_dtype == DataType::UNDEFINED) {
-    pt_out_dtype = d_out->dtype();
-  }
-  using MPType = typename kps::details::MPTypeTrait<T>::Type;
-
-  phi::ReduceGrad<T, kps::IdentityFunctor<T, MPType>>(
-      dev_ctx,
-      &pt_d_out,
-      &pt_d_x,
-      pt_out_dtype,
-      kps::IdentityFunctor<T, MPType>(reduce_num));
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(sum_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::ReduceSumGradKernel,
-                   bool,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16,
-                   int,
-                   int64_t,
-                   phi::dtype::complex<float>,
-                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/reverse_kernel.cu.cc b/paddle/phi/kernels/gpu/reverse_kernel.cu.cc
new file mode 100644
index 0000000000000..f11eaa11bcdb1
--- /dev/null
+++ b/paddle/phi/kernels/gpu/reverse_kernel.cu.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reverse_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/reverse_kernel_impl.h"
+
+PD_REGISTER_KERNEL(reverse,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReverseKernel,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   bool,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
new file mode 100644
index 0000000000000..cf076128b6939
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
@@ -0,0 +1,260 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_align_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 4;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <class T>
+__device__ void BilinearInterpolateGradient(const int height,
+                                            const int width,
+                                            T y,
+                                            T x,
+                                            T* w1,
+                                            T* w2,
+                                            T* w3,
+                                            T* w4,
+                                            int* x_low,
+                                            int* x_high,
+                                            int* y_low,
+                                            int* y_high) {
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return;
+  }
+
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  *y_low = static_cast<int>(y);
+  *x_low = static_cast<int>(x);
+  if (*y_low >= height - 1) {
+    *y_high = *y_low = height - 1;
+    y = static_cast<T>(*y_low);
+  } else {
+    *y_high = *y_low + 1;
+  }
+  if (*x_low >= width - 1) {
+    *x_high = *x_low = width - 1;
+    x = static_cast<T>(*x_low);
+  } else {
+    *x_high = *x_low + 1;
+  }
+  T ly = y - *y_low, lx = x - *x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void GPURoiAlignBackward(const int nthreads,
+                                    const T* input_rois,
+                                    const T* out_grad,
+                                    const int num_rois,
+                                    const float spatial_scale,
+                                    const int channels,
+                                    const int height,
+                                    const int width,
+                                    const int pooled_height,
+                                    const int pooled_width,
+                                    const int sampling_ratio,
+                                    int* roi_batch_id_data,
+                                    T* input_grad,
+                                    const bool continuous_coordinate) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+    const T* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = roi_batch_id_data[n];
+
+    T roi_offset = continuous_coordinate ? T(0.5) : 0;
+    T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset;
+    T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset;
+    T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset;
+    T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset;
+
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    if (!continuous_coordinate) {
+      roi_width = max(roi_width, static_cast<T>(1.));
+      roi_height = max(roi_height, static_cast<T>(1.));
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_input_grad =
+        input_grad + (roi_batch_ind * channels + c) * height * width;
+
+    const T* offset_out_grad =
+        out_grad + (n * channels + c) * pooled_height * pooled_width;
+    const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw];
+
+    int roi_bin_grid_h = (sampling_ratio > 0)
+                             ? sampling_ratio
+                             : ceil(roi_height / pooled_height);
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    const T count = roi_bin_grid_h * roi_bin_grid_w;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_ymin + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_xmin + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
+        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
+        BilinearInterpolateGradient(height,
+                                    width,
+                                    y,
+                                    x,
+                                    &w1,
+                                    &w2,
+                                    &w3,
+                                    &w4,
+                                    &x_low,
+                                    &x_high,
+                                    &y_low,
+                                    &y_high);
+        T diff1 = out_grad_this_bin * w1 / count;
+        T diff2 = out_grad_this_bin * w2 / count;
+        T diff3 = out_grad_this_bin * w3 / count;
+        T diff4 = out_grad_this_bin * w4 / count;
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          paddle::platform::CudaAtomicAdd(
+              offset_input_grad + y_low * width + x_low, diff1);
+          paddle::platform::CudaAtomicAdd(
+              offset_input_grad + y_low * width + x_high, diff2);
+          paddle::platform::CudaAtomicAdd(
+              offset_input_grad + y_high * width + x_low, diff3);
+          paddle::platform::CudaAtomicAdd(
+              offset_input_grad + y_high * width + x_high, diff4);
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void RoiAlignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& boxes,
+                        paddle::optional<const DenseTensor&> boxes_num,
+                        const DenseTensor& out_grad,
+                        int pooled_height,
+                        int pooled_width,
+                        float spatial_scale,
+                        int sampling_ratio,
+                        bool aligned,
+                        DenseTensor* dx) {
+  int rois_num = boxes.dims()[0];
+  int channels = x.dims()[1];
+  int height = x.dims()[2];
+  int width = x.dims()[3];
+
+  if (!dx) {
+    return;
+  }
+
+  DenseTensor box_batch_id_list;
+  box_batch_id_list.Resize({rois_num});
+  int* box_batch_size = dev_ctx.template HostAlloc<int>(&box_batch_id_list);
+
+  auto cplace = phi::CPUPlace();
+  auto gplace = dev_ctx.GetPlace();
+  if (boxes_num) {
+    int boxes_batch_size = boxes_num->numel();
+    std::vector<int> boxes_num_list(boxes_batch_size);
+    paddle::memory::Copy(cplace,
+                         boxes_num_list.data(),
+                         gplace,
+                         boxes_num->data<int>(),
+                         sizeof(int) * boxes_batch_size,
+                         0);
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
+        box_batch_size[i] = n;
+      }
+      start += boxes_num_list[n];
+    }
+  } else {
+    auto boxes_lod = boxes.lod().back();
+    int boxes_batch_size = boxes_lod.size() - 1;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_size[i] = n;
+      }
+    }
+  }
+  auto roi_ptr =
+      paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int));
+  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+  int bytes = box_batch_id_list.numel() * sizeof(int);
+  paddle::memory::Copy(
+      gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
+  dev_ctx.template Alloc<T>(dx);
+
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, dx, static_cast<T>(0));
+
+  int output_grad_size = out_grad.numel();
+  int blocks = NumBlocks(output_grad_size);
+  int threads = kNumCUDAThreads;
+
+  if (output_grad_size > 0) {
+    GPURoiAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        output_grad_size,
+        boxes.data<T>(),
+        out_grad.data<T>(),
+        rois_num,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        roi_id_data,
+        dx->data<T>(),
+        aligned);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu
index 2f906fa4f663b..cb3375dee95a5 100644
--- a/paddle/phi/kernels/gpu/roi_align_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu
@@ -18,7 +18,6 @@
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/empty_kernel.h"
 
 #include "paddle/fluid/memory/memory.h"
 
@@ -71,7 +70,7 @@ __device__ T BilinearInterpolate(
 }
 
 template <class T>
-__global__ void GPUROIAlignForward(const int nthreads,
+__global__ void GPURoiAlignForward(const int nthreads,
                                    const T* input_data,
                                    const T* input_rois,
                                    const float spatial_scale,
@@ -137,7 +136,7 @@ __global__ void GPUROIAlignForward(const int nthreads,
 }
 
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& boxes,
                     paddle::optional<const DenseTensor&> boxes_num,
@@ -233,7 +232,7 @@ void ROIAlignKernel(const Context& dev_ctx,
   int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
   paddle::memory::Copy(
       gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
-  GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+  GPURoiAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
       output_size,
       x.data<T>(),
       boxes.data<T>(),
@@ -252,4 +251,4 @@ void ROIAlignKernel(const Context& dev_ctx,
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {}
+    roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu
new file mode 100644
index 0000000000000..d093a71d23f4e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu
@@ -0,0 +1,165 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPURoiPoolBackward(const int nthreads,
+                                   const T* input_rois,
+                                   const T* output_grad,
+                                   const int64_t* arg_max_data,
+                                   const int num_rois,
+                                   const float spatial_scale,
+                                   const int channels,
+                                   const int height,
+                                   const int width,
+                                   const int pooled_height,
+                                   const int pooled_width,
+                                   int* box_batch_id_data,
+                                   T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    int roi_batch_ind = box_batch_id_data[n];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_arg_max_data = arg_max_data + output_offset;
+
+    int arg_max = offset_arg_max_data[ph * pooled_width + pw];
+    if (arg_max != -1) {
+      paddle::platform::CudaAtomicAdd(
+          offset_input_grad + arg_max,
+          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
+    }
+  }
+}
+
+template <typename T, typename Context>
+void RoiPoolGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& boxes,
+                       paddle::optional<const DenseTensor&> boxes_num,
+                       const DenseTensor& arg_max,
+                       const DenseTensor& out_grad,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       DenseTensor* dx) {
+  auto x_dims = x.dims();
+  int channels = x_dims[1];
+  int height = x_dims[2];
+  int width = x_dims[3];
+  int rois_num = boxes.dims()[0];
+
+  if (dx) {
+    DenseTensor box_batch_id_list;
+    box_batch_id_list.Resize({rois_num});
+    int* box_batch_id_data =
+        dev_ctx.template HostAlloc<int>(&box_batch_id_list);
+
+    auto gplace = dev_ctx.GetPlace();
+    if (boxes_num) {
+      int boxes_batch_size = boxes_num->numel();
+      std::vector<int> boxes_num_list(boxes_batch_size);
+      paddle::memory::Copy(phi::CPUPlace(),
+                           boxes_num_list.data(),
+                           gplace,
+                           boxes_num->data<int>(),
+                           sizeof(int) * boxes_batch_size,
+                           0);
+      int start = 0;
+      for (int n = 0; n < boxes_batch_size; ++n) {
+        for (int i = start; i < start + boxes_num_list[n]; ++i) {
+          box_batch_id_data[i] = n;
+        }
+        start += boxes_num_list[n];
+      }
+    } else {
+      auto boxes_lod = boxes.lod().back();
+      int boxes_batch_size = boxes_lod.size() - 1;
+      for (int n = 0; n < boxes_batch_size; ++n) {
+        for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+          box_batch_id_data[i] = n;
+        }
+      }
+    }
+    int bytes = box_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    paddle::memory::Copy(gplace,
+                         roi_id_data,
+                         phi::CPUPlace(),
+                         box_batch_id_data,
+                         bytes,
+                         dev_ctx.stream());
+
+    dev_ctx.template Alloc<T>(dx);
+    phi::funcs::SetConstant<Context, T> set_zero;
+    set_zero(dev_ctx, dx, static_cast<T>(0));
+
+    int output_grad_size = out_grad.numel();
+    int blocks = NumBlocks(output_grad_size);
+    int threads = kNumCUDAThreads;
+
+    if (output_grad_size > 0) {
+      GPURoiPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          output_grad_size,
+          boxes.data<T>(),
+          out_grad.data<T>(),
+          arg_max.data<int64_t>(),
+          rois_num,
+          spatial_scale,
+          channels,
+          height,
+          width,
+          pooled_height,
+          pooled_width,
+          roi_id_data,
+          dx->data<T>());
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) {
+  kernel->InputAt(3).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu
new file mode 100644
index 0000000000000..ab33e2cf64751
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu
@@ -0,0 +1,220 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roi_pool_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/memory/memory.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPURoiPoolForward(const int nthreads,
+                                  const T* input_data,
+                                  const T* input_rois,
+                                  const float spatial_scale,
+                                  const int channels,
+                                  const int height,
+                                  const int width,
+                                  const int pooled_height,
+                                  const int pooled_width,
+                                  int* box_batch_id_data,
+                                  T* output_data,
+                                  int64_t* arg_max_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
+
+    const T* offset_input_rois = input_rois + n * kROISize;
+    int box_batch_ind = box_batch_id_data[n];
+    int box_start_w = round(offset_input_rois[0] * spatial_scale);
+    int box_start_h = round(offset_input_rois[1] * spatial_scale);
+    int box_end_w = round(offset_input_rois[2] * spatial_scale);
+    int box_end_h = round(offset_input_rois[3] * spatial_scale);
+
+    int box_width = max(box_end_w - box_start_w + 1, 1);
+    int box_height = max(box_end_h - box_start_h + 1, 1);
+
+    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
+                                        static_cast<double>(box_height) /
+                                        static_cast<double>(pooled_height)));
+    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
+                                        static_cast<double>(box_width) /
+                                        static_cast<double>(pooled_width)));
+    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
+                                     static_cast<double>(box_height) /
+                                     static_cast<double>(pooled_height)));
+    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
+                                     static_cast<double>(box_width) /
+                                     static_cast<double>(pooled_width)));
+    hstart = min(max(hstart + box_start_h, 0), height);
+    hend = min(max(hend + box_start_h, 0), height);
+    wstart = min(max(wstart + box_start_w, 0), width);
+    wend = min(max(wend + box_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;
+    const T* offset_input_data =
+        input_data + (box_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
+        }
+      }
+    }
+    output_data[i] = maxval;
+    if (arg_max_data) {
+      arg_max_data[i] = maxidx;
+    }
+  }
+}
+
+template <typename T, typename Context>
+void RoiPoolKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& boxes,
+                   paddle::optional<const DenseTensor&> boxes_num,
+                   int pooled_height,
+                   int pooled_width,
+                   float spatial_scale,
+                   DenseTensor* out,
+                   DenseTensor* arg_max) {
+  auto x_dims = x.dims();
+  int batch_size = x_dims[0];
+  auto in_stride = phi::stride(x_dims);
+  int channels = x_dims[1];
+  int height = x_dims[2];
+  int width = x_dims[3];
+
+  int rois_num = boxes.dims()[0];
+
+  if (rois_num == 0) return;
+
+  int output_size = out->numel();
+  int blocks = NumBlocks(output_size);
+  int threads = kNumCUDAThreads;
+
+  DenseTensor box_batch_id_list;
+  box_batch_id_list.Resize({rois_num});
+  int* box_batch_id_data = dev_ctx.template HostAlloc<int>(&box_batch_id_list);
+  auto gplace = dev_ctx.GetPlace();
+
+  if (boxes_num) {
+    int boxes_batch_size = boxes_num->numel();
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        phi::errors::InvalidArgument(
+            "The batch size of input(ROIs) and input(X) must be the same but "
+            "received batch size of input(ROIs) and input(X) is %d and %d "
+            "respectively.",
+            boxes_batch_size,
+            batch_size));
+    std::vector<int> boxes_num_list(boxes_batch_size);
+    paddle::memory::Copy(phi::CPUPlace(),
+                         boxes_num_list.data(),
+                         gplace,
+                         boxes_num->data<int>(),
+                         sizeof(int) * boxes_batch_size,
+                         0);
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_list[n]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+      start += boxes_num_list[n];
+    }
+  } else {
+    auto boxes_lod = boxes.lod().back();
+    int boxes_batch_size = boxes_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        boxes_batch_size,
+        batch_size,
+        phi::errors::InvalidArgument(
+            "The batch size of input(ROIs) and input(X) must be the same but "
+            "received batch size of input(ROIs) and input(X) is %d and %d "
+            "respectively.",
+            boxes_batch_size,
+            batch_size));
+
+    int boxes_num_with_lod = boxes_lod[boxes_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num,
+                      boxes_num_with_lod,
+                      phi::errors::InvalidArgument(
+                          "The number of rois from input(ROIs) and its LOD "
+                          "must be the same. Received rois %d of input(ROIs) "
+                          "but the number of rois %d from its LOD is %d",
+                          rois_num,
+                          boxes_num_with_lod));
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+    }
+  }
+
+  int bytes = box_batch_id_list.numel() * sizeof(int);
+  auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes);
+  int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
+  paddle::memory::Copy(gplace,
+                       box_id_data,
+                       phi::CPUPlace(),
+                       box_batch_id_data,
+                       bytes,
+                       dev_ctx.stream());
+
+  T* output_data = dev_ctx.template Alloc<T>(out);
+  int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
+
+  GPURoiPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+      output_size,
+      x.data<T>(),
+      boxes.data<T>(),
+      spatial_scale,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      box_id_data,
+      output_data,
+      arg_max_data);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) {
+  kernel->OutputAt(1).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu
new file mode 100644
index 0000000000000..93e9e81882c9e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roll_grad_kernel.h"
+
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, typename Context>
+void RollGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const ScalarArray& shifts,
+                    const std::vector<int64_t>& axis,
+                    DenseTensor* x_grad) {
+  auto* in_data = out_grad.data<T>();
+  T* out_data = dev_ctx.template Alloc<T>(x_grad);
+  int64_t numel = out_grad.numel();
+  auto stream = dev_ctx.stream();
+
+  auto shifts_data = shifts.GetData();
+  size_t nums = shifts_data.size();
+  auto input_dim = out_grad.dims();
+  auto stride_dim = phi::stride(input_dim);
+
+  std::vector<int64_t> strides(nums), sizes(nums);
+  if (axis.size() == 0) {
+    strides[0] = 1;
+    sizes[0] = numel;
+    shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel;
+  } else {
+    for (size_t i = 0; i < nums; i++) {
+      int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
+      int64_t size = input_dim[dim];
+      if (size != 0) {
+        shifts_data[i] = ((-shifts_data[i]) % size + size) % size;
+        strides[i] = stride_dim[dim];
+        sizes[i] = size;
+      }
+    }
+  }
+
+  switch (nums) {
+    CALL_ROLL_CUDA_KERNEL(1);
+    CALL_ROLL_CUDA_KERNEL(2);
+    CALL_ROLL_CUDA_KERNEL(3);
+    CALL_ROLL_CUDA_KERNEL(4);
+    CALL_ROLL_CUDA_KERNEL(5);
+    CALL_ROLL_CUDA_KERNEL(6);
+    CALL_ROLL_CUDA_KERNEL(7);
+    CALL_ROLL_CUDA_KERNEL(8);
+    CALL_ROLL_CUDA_KERNEL(9);
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "shifts.size() should be less than 10, But received shifts.size() "
+          "= %d",
+          shifts_data.size()));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roll_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::RollGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu
new file mode 100644
index 0000000000000..1543335d3a0c5
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roll_kernel.cu
@@ -0,0 +1,90 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/roll_kernel.h"
+
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/utils/array.h"
+#include "paddle/phi/kernels/gpu/roll_kernel_impl.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, typename Context>
+void RollKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const ScalarArray& shifts,
+                const std::vector<int64_t>& axis,
+                DenseTensor* out) {
+  auto* in_data = x.data<T>();
+  T* out_data = dev_ctx.template Alloc<T>(out);
+  int64_t numel = x.numel();
+  auto stream = dev_ctx.stream();
+
+  auto shifts_data = shifts.GetData();
+
+  size_t nums = shifts_data.size();
+  auto input_dim = x.dims();
+  auto stride_dim = phi::stride(input_dim);
+
+  std::vector<int64_t> strides(nums), sizes(nums);
+  if (axis.size() == 0) {
+    strides[0] = 1;
+    sizes[0] = numel;
+    shifts_data[0] = (shifts_data[0] % numel + numel) % numel;
+  } else {
+    for (size_t i = 0; i < nums; i++) {
+      int dim = axis[i] >= 0 ? axis[i] : axis[i] + input_dim.size();
+      int64_t size = input_dim[dim];
+
+      if (size != 0) {
+        shifts_data[i] = (shifts_data[i] % size + size) % size;
+        strides[i] = stride_dim[dim];
+        sizes[i] = size;
+      }
+    }
+  }
+
+  switch (nums) {
+    CALL_ROLL_CUDA_KERNEL(1);
+    CALL_ROLL_CUDA_KERNEL(2);
+    CALL_ROLL_CUDA_KERNEL(3);
+    CALL_ROLL_CUDA_KERNEL(4);
+    CALL_ROLL_CUDA_KERNEL(5);
+    CALL_ROLL_CUDA_KERNEL(6);
+    CALL_ROLL_CUDA_KERNEL(7);
+    CALL_ROLL_CUDA_KERNEL(8);
+    CALL_ROLL_CUDA_KERNEL(9);
+    default:
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "shifts.size() should be less than 10, But received shifts.size() "
+          "= %d",
+          shifts_data.size()));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(roll,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::RollKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h
new file mode 100644
index 0000000000000..abe3ee470b4bc
--- /dev/null
+++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/core/utils/array.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
+
+namespace phi {
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, size_t Rank>
+__global__ void RollCudaKernel(const T* input,
+                               T* output,
+                               int64_t N,
+                               phi::Array<int64_t, Rank> shifts,
+                               phi::Array<int64_t, Rank> strides,
+                               phi::Array<int64_t, Rank> sizes) {
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+
+  int64_t output_idx = idx;
+  int64_t new_dim_idx = 0;
+
+#pragma unroll
+  for (size_t i = 0; i < Rank; i++) {
+    new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
+    if (new_dim_idx >= sizes[i]) {
+      output_idx += (shifts[i] - sizes[i]) * strides[i];
+    } else {
+      output_idx += shifts[i] * strides[i];
+    }
+  }
+  output[output_idx] = input[idx];
+}
+
+#define CALL_ROLL_CUDA_KERNEL(N)                                              \
+  case N: {                                                                   \
+    phi::Array<int64_t, N> _strides;                                          \
+    phi::Array<int64_t, N> _shifts;                                           \
+    phi::Array<int64_t, N> _sizes;                                            \
+    for (size_t idx = 0; idx < N; ++idx) {                                    \
+      _strides[idx] = strides[idx];                                           \
+      _shifts[idx] = shifts_data[idx];                                        \
+      _sizes[idx] = sizes[idx];                                               \
+    }                                                                         \
+    RollCudaKernel<                                                           \
+        T,                                                                    \
+        N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \
+             PADDLE_CUDA_NUM_THREADS,                                         \
+             0,                                                               \
+             stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes);  \
+    break;                                                                    \
+  }
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
index d9618dc159a6d..9d1769e18b4b8 100644
--- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu
@@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad,
                    ALL_LAYOUT,
                    phi::SegmentPoolGradKernel,
                    float,
-                   double) {}
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
index c38e935adf837..3128e534166ac 100644
--- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu
+++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu
@@ -19,5 +19,11 @@
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-PD_REGISTER_KERNEL(
-    segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {}
+PD_REGISTER_KERNEL(segment_pool,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SegmentPoolKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
similarity index 53%
rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu
rename to paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
index 278d4a6e5ab79..c5a243f45bd97 100644
--- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu
+++ b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
@@ -12,32 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_prod_kernel.h"
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
 
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gpu/reduce.h"
+#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
 
-namespace phi {
-
-template <typename T, typename Context>
-void ReduceProdKernel(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const std::vector<int64_t>& dims,
-                      bool keep_dim,
-                      bool reduce_all,
-                      DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  phi::Reduce<T, kps::MulFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(reduce_prod,
+PD_REGISTER_KERNEL(squeeze_grad,
                    GPU,
                    ALL_LAYOUT,
-                   phi::ReduceProdKernel,
+                   phi::SqueezeGradKernel,
                    float,
                    double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
                    int,
-                   int64_t) {}
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/squeeze_kernel.cu b/paddle/phi/kernels/gpu/squeeze_kernel.cu
new file mode 100644
index 0000000000000..ae15e210a02e7
--- /dev/null
+++ b/paddle/phi/kernels/gpu/squeeze_kernel.cu
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(squeeze,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
+                   int,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
new file mode 100644
index 0000000000000..bc3ef1bc623bb
--- /dev/null
+++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(tril_triu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TrilTriuGradKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
new file mode 100644
index 0000000000000..8c48edf9eff25
--- /dev/null
+++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(tril_triu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TrilTriuKernel,
+                   bool,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
new file mode 100644
index 0000000000000..6c3a2066f0f2d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeGradKernel,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
new file mode 100644
index 0000000000000..86b4462254637
--- /dev/null
+++ b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu
index 535cb812a20ea..616679057ffce 100644
--- a/paddle/phi/kernels/gpu/where_index_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_index_kernel.cu
@@ -20,150 +20,60 @@
 namespace cub = hipcub;
 #endif
 
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/where_index_kernel.h"
 
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace phi {
-
-template <typename T>
-__global__ void GetTrueNum(const T *cond_data,
-                           const int64_t numel,
-                           int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    true_num_array[idx] =
-        static_cast<int64_t>(static_cast<bool>(cond_data[idx]));
+template <typename T1, typename T2, typename OutT>
+struct IndexFunctor {
+  T2 stride[phi::DDim::kMaxRank];
+  int dims;
+  explicit IndexFunctor(const phi::DDim &in_dims) {
+    dims = in_dims.size();
+    std::vector<T2> strides_in_tmp;
+    strides_in_tmp.resize(dims, 1);
+    // get strides according to in_dims
+    for (T2 i = 1; i < dims; i++) {
+      strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i];
+    }
+    memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2));
   }
-}
 
-template <typename T>
-__global__ void SetTrueIndex(int64_t *out_ptr,
-                             const T *cond_data,
-                             const int64_t numel,
-                             const int64_t *stride_array,
-                             const int64_t rank,
-                             const int64_t *true_num_array) {
-  const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) {
-    // true_num_array is calculated by cub::InclusiveSum,
-    // cause the first element of true_num_array is 1,
-    // so we need substract 1 to get true index.
-    const int64_t true_index = true_num_array[idx] - 1;
-    if (static_cast<bool>(cond_data[idx])) {
-      int64_t rank_index = idx;
-      for (int j = 0; j < rank; j++) {
-        const int64_t out_index = rank_index / stride_array[j];
-        out_ptr[true_index * rank + j] = out_index;
-        rank_index -= out_index * stride_array[j];
+  HOSTDEVICE inline void operator()(OutT *out,
+                                    const T1 *mask,
+                                    const T2 *index,
+                                    const int num) {
+    int store_fix = 0;
+    for (int idx = 0; idx < num; idx++) {
+      if (mask[idx]) {
+        T2 data_index = index[idx];
+        // get index
+        for (int rank_id = dims - 1; rank_id >= 0; --rank_id) {
+          out[store_fix] = static_cast<OutT>(data_index / stride[rank_id]);
+          data_index = data_index % stride[rank_id];
+          store_fix++;
+        }
       }
     }
   }
-}
+};
 
 template <typename T, typename Context>
 void WhereIndexKernel(const Context &dev_ctx,
                       const DenseTensor &condition,
                       DenseTensor *out) {
-  const T *cond_data = condition.data<T>();
-  const int64_t numel = condition.numel();
+  DenseTensor in_data;
   auto dims = condition.dims();
-  const int rank = dims.size();
-
-  auto d_array_mem =
-      paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t));
-  auto h_array_mem =
-      paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t));
-
-  // "stride_array" is an array and len(stride_array)==rank,
-  // each element is the stride of each dimension -- the length from i to i+1.
-  int64_t *h_stride_array = reinterpret_cast<int64_t *>(h_array_mem->ptr());
-  int64_t *d_stride_array = reinterpret_cast<int64_t *>(d_array_mem->ptr());
-
-  // "true_num_array" is an array and len(stride_array)==numel,
-  // at the beginning,
-  // "true_num_array" will set 1 if condition[i] == true else 0,
-  // then it will be calculated by cub::InclusiveSum,
-  // so that we can get the true number before i as the out index
-  int64_t *d_true_num_array = d_stride_array + rank;
-
-  // the total_true_num is the total number of condition[i] == true
-  int64_t *h_total_true_num = h_stride_array + rank;
-
-  // alloce cub memory
-  size_t cub_size = 0;
-  cub::DeviceScan::InclusiveSum(nullptr,
-                                cub_size,
-                                d_true_num_array,
-                                d_true_num_array,
-                                numel,
-                                dev_ctx.stream());
-  auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t));
-  void *cub_data = cub_mem->ptr();
-
-  // set d_true_num_array[i]=1 if cond_data[i]==true else 0
-  const int threads = std::min(numel, static_cast<int64_t>(128));
-  const int64_t need_grids = (numel + threads - 1) / threads;
-  const int grids = std::min(need_grids, static_cast<int64_t>(256));
-  GetTrueNum<T><<<grids, threads, 0, dev_ctx.stream()>>>(
-      cond_data, numel, d_true_num_array);
-
-  // calculate the inclusive prefix sum of "true_num_array"
-  // to get the index of "out" tensor,
-  // and the total number of cond_data[i]==true.
-  // Example:
-  // condition: F T T F F F T T
-  // before:    0 1 1 0 0 0 1 1
-  // after:     0 1 2 2 2 2 3 4
-  // out:       1 2 6 7
-  cub::DeviceScan::InclusiveSum(cub_data,
-                                cub_size,
-                                d_true_num_array,
-                                d_true_num_array,
-                                numel,
-                                dev_ctx.stream());
-
-  // calculate each dimension's stride
-  h_stride_array[rank - 1] = 1;
-  for (int i = rank - 2; i >= 0; i--) {
-    h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1];
-  }
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       d_stride_array,
-                       phi::CPUPlace(),
-                       h_stride_array,
-                       rank * sizeof(int64_t),
-                       dev_ctx.stream());
-
-  // get total ture number and set output size
-  // the last element of cub::InclusiveSum is the total number
-  paddle::memory::Copy(phi::CPUPlace(),
-                       h_total_true_num,
-                       dev_ctx.GetPlace(),
-                       d_true_num_array + numel - 1,
-                       sizeof(int64_t),
-                       dev_ctx.stream());
-  dev_ctx.Wait();
-
-  int64_t true_num = *h_total_true_num;
-  out->Resize(phi::make_ddim({static_cast<int64_t>(true_num), rank}));
-  auto *out_data = dev_ctx.template Alloc<int64_t>(out);
-
-  if (true_num == 0) {
-    return;
-  }
-
-  // using true_num_array and stride_array to calculate the output index
-  SetTrueIndex<T><<<grids, threads, 0, dev_ctx.stream()>>>(
-      out_data, cond_data, numel, d_stride_array, rank, d_true_num_array);
+  using Functor = IndexFunctor<T, int64_t, int64_t>;
+  Functor index_functor = Functor(dims);
+  phi::funcs::SelectKernel<T, T, int64_t, 0, Functor>(
+      dev_ctx, condition, in_data, out, index_functor);
 }
-
 }  // namespace phi
 
 PD_REGISTER_KERNEL(where_index,
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
index b4a6fe337c8d2..9c5e77d5fd846 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
@@ -71,15 +71,15 @@ void ConvCudnnGradGradKernel(
   auto dW = filter_grad;
   auto dX = input_grad;
   if (ddO) {
-    ddO->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(ddO);
     phi::funcs::SetConstant<Context, T> set_zero;
     set_zero(ctx, ddO, static_cast<T>(0));
   }
   if (dW) {
-    dW->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(dW);
   }
   if (dX) {
-    dX->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(dX);
   }
 
   // const T* x = X->data<T>();
@@ -131,7 +131,7 @@ void ConvCudnnGradGradKernel(
     }
     if (dX) {
       ResizeToChannelFirst<Context, T>(ctx, dX, &transformed_dX_channel);
-      transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_dX_channel);
     }
 
   } else {
@@ -186,13 +186,13 @@ void ConvCudnnGradGradKernel(
     transformed_ddX.Resize(new_input_shape);
     transformed_dX.Resize(new_input_shape);
 
-    transformed_X.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_X);
 
     if (ddX) {
-      transformed_ddX.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_ddX);
     }
     if (dX) {
-      transformed_dX.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_dX);
     }
 
     // pad for input
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 64148e902fdb2..a99a1e5f9471e 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -58,10 +58,10 @@ void ConvCudnnGradKernel(const Context& ctx,
                          DenseTensor* input_grad,
                          DenseTensor* filter_grad) {
   if (input_grad) {
-    input_grad->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(input_grad);
   }
   if (filter_grad) {
-    filter_grad->mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(filter_grad);
   }
 
   std::vector<int> dilations = dilations_t;
@@ -204,12 +204,12 @@ void ConvCudnnGradKernel(const Context& ctx,
     }
     DDim new_input_shape(make_ddim(new_input_shape_vec));
     transformed_input.Resize(new_input_shape);
-    transformed_input.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_input);
 
     transformed_input_grad.Resize(new_input_shape);
 
     if (input_grad) {
-      transformed_input_grad.mutable_data<T>(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_input_grad);
     }
     // pad for input
     const int rank = transformed_input_channel.dims().size();
@@ -427,7 +427,7 @@ void ConvCudnnGradKernel(const Context& ctx,
     if (use_addto) {
       DenseTensor temp_tensor(transformed_input_grad.type());
       temp_tensor.Resize(transformed_input_grad.dims());
-      T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
+      T* temp_tensor_data = ctx.template Alloc<T>(&temp_tensor);
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_GPU_SUCCESS(
@@ -513,7 +513,7 @@ void ConvCudnnGradKernel(const Context& ctx,
         axes[i] = i;
       }
 
-      transformed_input_grad_channel.mutable_data(ctx.GetPlace());
+      ctx.template Alloc<T>(&transformed_input_grad_channel);
       if (transformed_input_channel.dims().size() == 4) {
         paddle::operators::RemovePaddingSlice<Context, T, 4>(
             ctx,
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index 931b6d68845e2..c2970cc8cde75 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -54,7 +54,7 @@ void ConvCudnnKernel(const Context& ctx,
                      int workspace_size_MB,
                      bool exhaustive_search_t,
                      DenseTensor* output) {
-  output->mutable_data<T>(ctx.GetPlace());
+  ctx.template Alloc<T>(output);
   std::vector<int> paddings = paddings_t;
   std::vector<int> dilations = dilations_t;
 
@@ -170,7 +170,7 @@ void ConvCudnnKernel(const Context& ctx,
     }
     DDim new_input_shape(make_ddim(new_input_shape_vec));
     transformed_input.Resize(new_input_shape);
-    transformed_input.mutable_data<T>(ctx.GetPlace());
+    ctx.template Alloc<T>(&transformed_input);
 
     const int rank = transformed_input_channel.dims().size();
     T pad_value(0.0);
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
new file mode 100644
index 0000000000000..2893bd74b1bce
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
@@ -0,0 +1,1122 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
+#endif
+
+namespace phi {
+
+using GPUDNNDataLayout = paddle::platform::DataLayout;
+
+template <typename T, typename Context>
+void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
+                                      const DenseTensor& x,
+                                      const DenseTensor& filter,
+                                      const DenseTensor& dout,
+                                      const std::vector<int>& strides,
+                                      const std::vector<int>& paddings,
+                                      const std::string& padding_algorithm,
+                                      int groups,
+                                      const std::vector<int>& dilations,
+                                      const std::string& data_format,
+                                      DenseTensor* dx,
+                                      DenseTensor* dfilter) {
+  const T* filter_data = filter.data<T>();
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ =
+      dilations;  // cudnn v5 does not support dilations
+  const GPUDNNDataLayout data_layout =
+      (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
+                             : GPUDNNDataLayout::kNHWC);
+
+  // if channel_last, transpose to channel_first
+  DenseTensor x_transpose;
+  DenseTensor dout_transpose;
+  std::vector<int> x_vec = vectorize<int>(x.dims());
+  std::vector<int> out_vec = vectorize<int>(dout.dims());
+  if (data_layout == GPUDNNDataLayout::kNHWC) {
+    if (strides.size() == 2U) {
+      std::vector<int> axis = {0, 3, 1, 2};
+      for (size_t i = 0; i < axis.size(); ++i) {
+        x_vec[i] = x.dims()[axis[i]];
+        out_vec[i] = dout.dims()[axis[i]];
+      }
+      x_transpose = Transpose<T, Context>(ctx, x, axis);
+      dout_transpose = Transpose<T, Context>(ctx, dout, axis);
+    } else if (strides.size() == 3U) {
+      std::vector<int> axis = {0, 4, 1, 2, 3};
+      for (size_t i = 0; i < axis.size(); ++i) {
+        x_vec[i] = x.dims()[axis[i]];
+        out_vec[i] = dout.dims()[axis[i]];
+      }
+      x_transpose = Transpose<T, Context>(ctx, x, axis);
+      dout_transpose = Transpose<T, Context>(ctx, dout, axis);
+    }
+  } else {
+    x_transpose = x;
+    dout_transpose = dout;
+  }
+
+  // update padding and dilation
+  auto x_dims = x_transpose.dims();
+  auto filter_dims = filter.dims();
+  DDim x_data_dims;
+  x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
+
+  std::vector<int> x_pad(x_dims.size() * 2, 0);
+  DenseTensor transformed_dout;
+  std::vector<int> padding_common(data_dim, 0);
+  if (!is_sys_pad) {
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_dout_shape_vec(data_dim + 2);
+    new_dout_shape_vec[0] = dout_transpose.dims()[0];
+    new_dout_shape_vec[1] = dout_transpose.dims()[1];
+
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
+      padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
+      new_dout_shape_vec[i + 2] =
+          dout_transpose.dims()[i + 2] + padding_diff[i];
+      x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
+      x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
+    }
+
+    transformed_dout.Resize(make_ddim(new_dout_shape_vec));
+    ctx.template Alloc<T>(&transformed_dout);
+
+    const int rank = x_transpose.dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(
+            ctx, x_pad, dout_transpose, pad_value, &transformed_dout);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(
+            ctx, x_pad, dout_transpose, pad_value, &transformed_dout);
+      } break;
+      default:
+        PADDLE_THROW(errors::InvalidArgument(
+            "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor."));
+    }
+  } else {
+    transformed_dout = dout_transpose;
+    if (paddings_.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[2 * i];
+      }
+    }
+  }
+
+  const T* x_data = x_transpose.data<T>();
+  const T* dout_data = transformed_dout.data<T>();
+  out_vec = vectorize<int>(transformed_dout.dims());
+
+  // ------------------- cudnn descriptors ---------------------
+  GPUDNNDataLayout layout;
+
+  if (strides.size() == 2U) {
+    layout = GPUDNNDataLayout::kNCHW;
+  } else {
+    layout = GPUDNNDataLayout::kNCDHW;
+  }
+
+  int iwo_groups = groups;
+  int c_groups = 1;
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
+  iwo_groups = 1;
+  c_groups = groups;
+  groups = 1;
+#endif
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+  paddle::operators::ConvArgs args1{&transformed_dout,
+                                    &filter,
+                                    &x_transpose,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+  paddle::operators::ConvArgs args2{&transformed_dout,
+                                    &filter,
+                                    &x_transpose,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+
+#ifdef PADDLE_WITH_HIP
+  miopenConvFwdAlgorithm_t data_algo{};
+  miopenConvBwdWeightsAlgorithm_t filter_algo{};
+#else
+  cudnnConvolutionFwdAlgo_t data_algo{};
+  cudnnConvolutionBwdFilterAlgo_t filter_algo{};
+#endif
+
+  auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
+  size_t workspace_size = 0;
+  auto handle = ctx.cudnn_handle();
+  bool deterministic = FLAGS_cudnn_deterministic;
+  T* dx_data = nullptr;
+  T* dfilter_data = nullptr;
+
+  if (dx) {
+    dx_data = ctx.template Alloc<T>(dx);
+    args1.handle = handle;
+    args1.idesc.set(transformed_dout, iwo_groups);
+    args1.wdesc.set(filter, layout_tensor, iwo_groups);
+    args1.odesc.set(x_transpose, iwo_groups);
+    args1.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_groups);
+#ifdef PADDLE_WITH_HIP
+    using search1 =
+        paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
+    data_algo =
+        search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
+#else
+    using search1 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+    data_algo = search1::Find<T>(args1, false, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
+  }
+
+  if (dfilter) {
+    dfilter_data = ctx.template Alloc<T>(dfilter);
+    args2.handle = handle;
+    args2.idesc.set(transformed_dout, iwo_groups);
+    args2.wdesc.set(*dfilter, layout_tensor, iwo_groups);
+    args2.odesc.set(x_transpose, iwo_groups);
+    args2.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_groups);
+#ifdef PADDLE_WITH_HIP
+    using search2 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
+    filter_algo =
+        search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
+#else
+    using search2 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
+    filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo));
+#endif
+  }
+
+  // ------------------- cudnn conv backward data ---------------------
+  // FIxME(typhoonzero): template type T may not be the same as cudnn call.
+  int x_offset = x.numel() / x.dims()[0] / groups;
+  int dout_offset =
+      transformed_dout.numel() / transformed_dout.dims()[0] / groups;
+  int filter_offset = filter.numel() / groups;
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+  auto workspace_handle = ctx.cudnn_workspace_handle();
+  if (dx) {
+    // Because beta is zero, it is unnecessary to reset dx.
+    for (int g = 0; g < groups; g++) {
+#ifdef PADDLE_WITH_HIP
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            dynload::miopenConvolutionForward(handle,
+                                              &alpha,
+                                              args1.idesc.desc(),
+                                              dout_data + dout_offset * g,
+                                              args1.wdesc.desc(),
+                                              filter_data + filter_offset * g,
+                                              args1.cdesc.desc(),
+                                              data_algo,
+                                              &beta,
+                                              args1.odesc.desc(),
+                                              dx_data + x_offset * g,
+                                              cudnn_workspace,
+                                              workspace_size));
+      };
+#else   // PADDLE_WITH_HIP
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            dynload::cudnnConvolutionForward(handle,
+                                             &alpha,
+                                             args1.idesc.desc(),
+                                             dout_data + dout_offset * g,
+                                             args1.wdesc.desc(),
+                                             filter_data + filter_offset * g,
+                                             args1.cdesc.desc(),
+                                             data_algo,
+                                             cudnn_workspace,
+                                             workspace_size,
+                                             &beta,
+                                             args1.odesc.desc(),
+                                             dx_data + x_offset * g));
+      };
+#endif  // PADDLE_WITH_HIP
+      workspace_handle.RunFunc(cudnn_func, workspace_size);
+    }
+
+    if (data_layout == GPUDNNDataLayout::kNHWC) {
+      DenseTensor dx_transpose;
+      DenseTensor dx_nchw;
+      dx_nchw.ShareDataWith(*dx);
+      dx_nchw.Resize(make_ddim(x_vec));
+      if (strides.size() == 2U) {
+        std::vector<int> axis = {0, 2, 3, 1};
+        dx_transpose = Transpose<T, Context>(ctx, dx_nchw, axis);
+        *dx = dx_transpose;
+      } else if (strides.size() == 3U) {
+        std::vector<int> axis = {0, 2, 3, 4, 1};
+        dx_transpose = Transpose<T, Context>(ctx, dx_nchw, axis);
+        *dx = dx_transpose;
+      }
+    }
+  }
+
+  // ------------------- cudnn conv backward filter ---------------------
+  if (dfilter) {
+    // Because beta is zero, it is unnecessary to reset dfilter.
+    // Gradient with respect to the filter
+    for (int g = 0; g < groups; g++) {
+#ifdef PADDLE_WITH_HIP
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights(
+            handle,
+            &alpha,
+            args2.odesc.desc(),
+            x_data + x_offset * g,
+            args2.idesc.desc(),
+            dout_data + dout_offset * g,
+            args2.cdesc.desc(),
+            filter_algo,
+            &beta,
+            args2.wdesc.desc(),
+            dfilter_data + filter_offset * g,
+            cudnn_workspace,
+            workspace_size));
+      };
+#else   // PADDLE_WITH_HIP
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
+            handle,
+            &alpha,
+            args2.idesc.desc(),
+            dout_data + dout_offset * g,
+            args2.odesc.desc(),
+            x_data + x_offset * g,
+            args2.cdesc.desc(),
+            filter_algo,
+            cudnn_workspace,
+            workspace_size,
+            &beta,
+            args2.wdesc.desc(),
+            dfilter_data + filter_offset * g));
+      };
+#endif  // PADDLE_WITH_HIP
+      workspace_handle.RunFunc(cudnn_func, workspace_size);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeGradGPUDNNKernel(const Context& ctx,
+                                     const DenseTensor& x,
+                                     const DenseTensor& filter,
+                                     const DenseTensor& dout,
+                                     const std::vector<int>& strides,
+                                     const std::vector<int>& paddings_,
+                                     const std::vector<int>& output_padding,
+                                     const std::vector<int>& output_size,
+                                     const std::string& padding_algorithm,
+                                     int groups,
+                                     const std::vector<int>& dilations_,
+                                     const std::string& data_format,
+                                     DenseTensor* dx,
+                                     DenseTensor* dfilter) {
+  ConvTransposeGradRawGPUDNNKernel<T, Context>(ctx,
+                                               x,
+                                               filter,
+                                               dout,
+                                               strides,
+                                               paddings_,
+                                               padding_algorithm,
+                                               groups,
+                                               dilations_,
+                                               data_format,
+                                               dx,
+                                               dfilter);
+}
+
+/*
+ * Inputs:  I, filter, dout, ddI, ddfilter
+ * Outputs: ddout, dfilter, dI
+ * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I)
+ * dfilter = conv_bp_filter(dout, ddI)
+ * dI = conv(dout, ddfilter)
+ */
+template <typename T, typename Context>
+void Conv2dTransposeDoubleGradGPUDNNKernel(
+    const Context& ctx,
+    const DenseTensor& x,
+    const DenseTensor& filter,
+    const DenseTensor& dout,
+    const DenseTensor& ddx,
+    const DenseTensor& ddfilter,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const std::vector<int>& output_padding,
+    const std::vector<int>& output_size,
+    const std::string& padding_algorithm,
+    int groups,
+    const std::vector<int>& dilations,
+    const std::string& data_format,
+    DenseTensor* dx,
+    DenseTensor* dfilter,
+    DenseTensor* ddout) {
+  if (dx) {
+    ctx.template Alloc<T>(dx);
+  }
+  if (dfilter) {
+    ctx.template Alloc<T>(dfilter);
+  }
+  if (ddout) {
+    ctx.template Alloc<T>(ddout);
+    funcs::SetConstant<Context, T> set_zero;
+    set_zero(ctx, ddout, static_cast<T>(0));
+  }
+
+  const T* filter_ = filter.data<T>();
+  const T* dout_ = dout.data<T>();
+  const T* ddx_ = nullptr;
+  const T* ddfilter_ = nullptr;
+  T* dx_ = nullptr;
+  T* dfilter_ = nullptr;
+  T* ddout_ = nullptr;
+  T* transformed_dx_ = nullptr;
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  bool deterministic = FLAGS_cudnn_deterministic;
+  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+
+  // transform DenseTensors to channel first-----------
+  DenseTensor transformed_x_channel(x.type());
+  DenseTensor transformed_dout_channel(dout.type());
+  DenseTensor transformed_ddx_channel(x.type());
+
+  DenseTensor transformed_dx_channel(x.type());
+  DenseTensor transformed_ddout_channel(dout.type());
+
+  if (channel_last) {
+    ResizeToChannelFirst<Context, T>(ctx, &x, &transformed_x_channel);
+    TransToChannelFirst<Context, T>(ctx, &x, &transformed_x_channel);
+
+    ResizeToChannelFirst<Context, T>(ctx, &dout, &transformed_dout_channel);
+    TransToChannelFirst<Context, T>(ctx, &dout, &transformed_dout_channel);
+
+    ResizeToChannelFirst<Context, T>(ctx, &ddx, &transformed_ddx_channel);
+    TransToChannelFirst<Context, T>(ctx, &ddx, &transformed_ddx_channel);
+
+    if (dx) {
+      ResizeToChannelFirst<Context, T>(ctx, dx, &transformed_dx_channel);
+      ctx.template Alloc<T>(&transformed_dx_channel);
+    }
+    if (ddout) {
+      ResizeToChannelFirst<Context, T>(ctx, ddout, &transformed_ddout_channel);
+    }
+  } else {
+    transformed_x_channel = x;
+    transformed_dout_channel = dout;
+    transformed_ddx_channel = ddx;
+
+    if (dx) {
+      transformed_dx_channel = *dx;
+    }
+  }
+  std::vector<int> out_vec = vectorize<int>(transformed_dout_channel.dims());
+
+  auto x_dims = transformed_x_channel.dims();
+  auto filter_dims = filter.dims();
+  DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
+  DenseTensor transformed_x(x.type());
+  DenseTensor transformed_ddx(x.type());
+
+  DenseTensor transformed_dout(dout.type());
+
+  std::vector<int> padding_common(data_dim, 0);
+  std::vector<int> input_pad(x.dims().size() * 2, 0);
+
+  if (!is_sys_pad) {
+    // get pad
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_input_shape_vec(data_dim + 2);
+    std::vector<int> new_output_grad_shape_vec(data_dim + 2);
+
+    new_input_shape_vec[0] = transformed_x_channel.dims()[0];
+    new_input_shape_vec[1] = transformed_x_channel.dims()[1];
+
+    new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0];
+    new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1];
+
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
+      padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
+      new_input_shape_vec[i + 2] =
+          transformed_x_channel.dims()[i + 2] + padding_diff[i];
+
+      new_output_grad_shape_vec[i + 2] =
+          transformed_dout_channel.dims()[i + 2] + padding_diff[i];
+
+      input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
+      input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
+    }
+    DDim new_input_shape(make_ddim(new_input_shape_vec));
+    transformed_x.Resize(new_input_shape);
+    transformed_ddx.Resize(new_input_shape);
+    transformed_dout.Resize(make_ddim(new_output_grad_shape_vec));
+
+    ctx.template Alloc<T>(&transformed_x);
+    ctx.template Alloc<T>(&transformed_ddx);
+    ctx.template Alloc<T>(&transformed_dout);
+
+    // pad for input
+    const int rank = x.dims().size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(
+            ctx, input_pad, transformed_x_channel, pad_value, &transformed_x);
+        funcs::PadFunction<Context, T, 4>(ctx,
+                                          input_pad,
+                                          transformed_dout_channel,
+                                          pad_value,
+                                          &transformed_dout);
+        funcs::PadFunction<Context, T, 4>(ctx,
+                                          input_pad,
+                                          transformed_ddx_channel,
+                                          pad_value,
+                                          &transformed_ddx);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(
+            ctx, input_pad, transformed_x_channel, pad_value, &transformed_x);
+        funcs::PadFunction<Context, T, 5>(ctx,
+                                          input_pad,
+                                          transformed_ddx_channel,
+                                          pad_value,
+                                          &transformed_ddx);
+      } break;
+      default:
+        PADDLE_THROW(errors::InvalidArgument(
+            "ConvOp only support tensors with 4 or 5 dimensions."));
+    }
+  } else {
+    transformed_x = transformed_x_channel;
+    transformed_dout = transformed_dout_channel;
+    transformed_ddx = transformed_ddx_channel;
+
+    if (paddings_.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[2 * i];
+      }
+    }
+  }
+
+  std::vector<int64_t> starts(data_dim, 0);
+  std::vector<int64_t> ends(data_dim, 0);
+  std::vector<int64_t> axes(data_dim, 0);
+  for (size_t i = 0; i < data_dim; ++i) {
+    starts[i] = input_pad[2 * i + 4] * (strides[i] + 1);
+    ends[i] = starts[i] + out_vec[i + 2];
+    axes[i] = i + 2;
+  }
+
+  std::vector<int> transformed_out_vec = out_vec;
+  for (size_t i = 0; i < data_dim; ++i) {
+    transformed_out_vec[i + 2] =
+        out_vec[i + 2] +
+        (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] -
+        2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1];
+  }
+
+  if (!is_sys_pad) {
+    transformed_ddout_channel.Resize(make_ddim(transformed_out_vec));
+    ctx.template Alloc<T>(&transformed_ddout_channel);
+  } else {
+    ctx.template Alloc<T>(ddout);
+    transformed_ddout_channel = *ddout;
+    transformed_ddout_channel.Resize(make_ddim(transformed_out_vec));
+  }
+
+  const T* x_ = transformed_x.data<T>();
+
+  int iwo_group = groups;
+  int c_group = 1;
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
+  iwo_group = 1;
+  c_group = groups;
+  groups = 1;
+#endif
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+
+  auto handle = ctx.cudnn_handle();
+
+  paddle::operators::ConvArgs args1{&transformed_ddout_channel,
+                                    &filter,
+                                    &transformed_ddx,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+  paddle::operators::ConvArgs args2{&transformed_ddout_channel,
+                                    &ddfilter,
+                                    &transformed_x,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+
+  paddle::operators::ConvArgs args3{&transformed_dout,
+                                    dfilter,
+                                    &transformed_ddx_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+  paddle::operators::ConvArgs args4{&transformed_dout,
+                                    &ddfilter,
+                                    &transformed_dx_channel,
+                                    strides,
+                                    padding_common,
+                                    dilations_,
+                                    dtype};
+#ifdef PADDLE_WITH_HIP
+  miopenConvBwdDataAlgorithm_t bwd_algo1 =
+      static_cast<miopenConvBwdDataAlgorithm_t>(0);
+  miopenConvBwdDataAlgorithm_t bwd_algo2 =
+      static_cast<miopenConvBwdDataAlgorithm_t>(0);
+  miopenConvFwdAlgorithm_t data_algo = static_cast<miopenConvFwdAlgorithm_t>(0);
+  miopenConvBwdWeightsAlgorithm_t filter_algo =
+      static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
+#else
+  cudnnConvolutionBwdDataAlgo_t bwd_algo1 =
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
+  cudnnConvolutionBwdDataAlgo_t bwd_algo2 =
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
+  cudnnConvolutionFwdAlgo_t data_algo =
+      static_cast<cudnnConvolutionFwdAlgo_t>(0);
+  cudnnConvolutionBwdFilterAlgo_t filter_algo =
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
+#endif
+
+  auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
+
+  // ddo = conv(ddI, filter) + conv(I, ddfilter)
+  size_t workspace_size = 0;
+
+  T* transformed_ddout_channel_ = nullptr;
+
+  if (ddout) {
+    ddout_ = ddout->data<T>();
+    transformed_ddout_channel_ = transformed_ddout_channel.data<T>();
+
+    args1.handle = handle;
+    args1.idesc.set(transformed_ddout_channel, iwo_group);
+    args1.wdesc.set(filter, layout, iwo_group);
+    args1.odesc.set(transformed_ddx, iwo_group);
+    args1.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+#ifdef PADDLE_WITH_HIP
+    using search1 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = search1::GetWorkspaceSize(args1);
+    bwd_algo1 =
+        search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
+#else
+    using search1 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
+    bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
+    workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
+#endif
+
+    ddfilter_ = ddfilter.data<T>();
+    args2.handle = handle;
+    args2.idesc.set(transformed_ddout_channel, iwo_group);
+    args2.wdesc.set(ddfilter, layout, iwo_group);
+    args2.odesc.set(transformed_x, iwo_group);
+    args2.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+#ifdef PADDLE_WITH_HIP
+    using search2 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
+    bwd_algo2 =
+        search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
+#else
+    using search2 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
+    bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2));
+#endif
+  }
+
+  if (dfilter) {
+    dfilter_ = dfilter->data<T>();
+    args3.handle = handle;
+    args3.idesc.set(transformed_dout, iwo_group);
+    args3.wdesc.set(*dfilter, layout, iwo_group);
+
+    args3.odesc.set(transformed_ddx_channel, iwo_group);
+
+    args3.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+#ifdef PADDLE_WITH_HIP
+    using search3 =
+        paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
+    filter_algo =
+        search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
+#else
+    using search3 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
+    filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
+#endif
+  }
+
+  if (dx) {
+    transformed_dx_ = transformed_dx_channel.data<T>();
+
+    args4.handle = handle;
+    args4.idesc.set(transformed_dout, iwo_group);
+    args4.wdesc.set(ddfilter, layout, iwo_group);
+    args4.odesc.set(transformed_dx_channel, iwo_group);
+    args4.cdesc.set(dtype,
+                    padding_common,
+                    strides,
+                    dilations_,
+                    paddle::platform::AllowTF32Cudnn(),
+                    c_group);
+#ifdef PADDLE_WITH_HIP
+    using search4 =
+        paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
+    data_algo =
+        search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
+#else
+    using search4 =
+        paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
+    data_algo = search4::Find<T>(args4, false, deterministic, ctx);
+    workspace_size =
+        std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
+  }
+
+  int i_n, i_c, i_d, i_h, i_w;
+  paddle::operators::GetNCDHW(transformed_x.dims(),
+                              GPUDNNDataLayout::kNCHW,
+                              &i_n,
+                              &i_c,
+                              &i_d,
+                              &i_h,
+                              &i_w);
+
+  int o_n, o_c, o_d, o_h, o_w;
+  paddle::operators::GetNCDHW(transformed_dout.dims(),
+                              GPUDNNDataLayout::kNCHW,
+                              &o_n,
+                              &o_c,
+                              &o_d,
+                              &o_h,
+                              &o_w);
+
+  int group_offset_in =
+      transformed_x.numel() / transformed_x.dims()[0] / groups;
+  int group_offset_out =
+      transformed_dout.numel() / transformed_dout.dims()[0] / groups;
+  int group_offset_filter = filter.numel() / groups;
+
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+
+  auto wkspace_handle = ctx.cudnn_workspace_handle();
+
+  if (ddout) {
+    ddx_ = transformed_ddx.data<T>();
+    for (int i = 0; i < groups; i++) {
+#ifdef PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
+                handle,
+                &alpha,
+                args1.odesc.desc(),
+                ddx_ + i * group_offset_in,
+                args1.wdesc.desc(),
+                filter_ + i * group_offset_filter,
+                args1.cdesc.desc(),
+                bwd_algo1,
+                &beta,
+                args1.idesc.desc(),
+                transformed_ddout_channel_ + i * group_offset_out,
+                workspace_ptr,
+                workspace_size));
+          },
+          workspace_size);
+#else   // PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
+                handle,
+                &alpha,
+                args1.wdesc.desc(),
+                filter_ + i * group_offset_filter,
+                args1.odesc.desc(),
+                ddx_ + i * group_offset_in,
+                args1.cdesc.desc(),
+                bwd_algo1,
+                workspace_ptr,
+                workspace_size,
+                &beta,
+                args1.idesc.desc(),
+                transformed_ddout_channel_ + i * group_offset_out));
+          },
+          workspace_size);
+#endif  // PADDLE_WITH_HIP
+    }
+
+    for (int i = 0; i < groups; i++) {
+#ifdef PADDLE_WITH_HIP
+      // MIOPEN ONLY support beta to be 0.0f
+      DenseTensor conv_x_ddfilter(dout.type());
+      conv_x_ddfilter.Resize(transformed_ddout_channel.dims());
+      T* conv_x_ddfilter_data = ctx.template Alloc<T>(&conv_x_ddfilter);
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
+                handle,
+                &alpha,
+                args2.odesc.desc(),
+                x_ + i * group_offset_in,
+                args2.wdesc.desc(),
+                ddfilter_ + i * group_offset_filter,
+                args2.cdesc.desc(),
+                bwd_algo2,
+                &beta,
+                args2.idesc.desc(),
+                conv_x_ddfilter_data + i * group_offset_out,
+                workspace_ptr,
+                workspace_size));
+          },
+          workspace_size);
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor(
+          handle,
+          miopenTensorOpAdd,
+          &alpha,
+          args2.idesc.desc(),
+          transformed_ddout_channel_ + i * group_offset_out,
+          &alpha,
+          args2.idesc.desc(),
+          conv_x_ddfilter_data + i * group_offset_out,
+          &beta,
+          args2.idesc.desc(),
+          transformed_ddout_channel_ + i * group_offset_out));
+#else   // PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
+                handle,
+                &alpha,
+                args2.wdesc.desc(),
+                ddfilter_ + i * group_offset_filter,
+                args2.odesc.desc(),
+                x_ + i * group_offset_in,
+                args2.cdesc.desc(),
+                bwd_algo2,
+                workspace_ptr,
+                workspace_size,
+                &alpha,
+                args2.idesc.desc(),
+                transformed_ddout_channel_ + i * group_offset_out));
+          },
+          workspace_size);
+#endif  // PADDLE_WITH_HIP
+    }
+
+    if ((!is_sys_pad) && (!channel_last)) {
+      if (strides.size() == 2U) {
+        funcs::Slice<Context, T, 4>(
+            ctx, &transformed_ddout_channel, ddout, starts, ends, axes);
+      } else if (!is_sys_pad && strides.size() == 3U) {
+        funcs::Slice<Context, T, 5>(
+            ctx, &transformed_ddout_channel, ddout, starts, ends, axes);
+      }
+    } else if ((!is_sys_pad) && (channel_last)) {
+      if (strides.size() == 2U) {
+        funcs::Slice<Context, T, 4>(ctx,
+                                    &transformed_ddout_channel,
+                                    &transformed_ddout_channel,
+                                    starts,
+                                    ends,
+                                    axes);
+      } else if (!is_sys_pad && strides.size() == 3U) {
+        funcs::Slice<Context, T, 5>(ctx,
+                                    &transformed_ddout_channel,
+                                    &transformed_ddout_channel,
+                                    starts,
+                                    ends,
+                                    axes);
+      }
+
+      TransToChannelLast<Context, T>(ctx, &transformed_ddout_channel, ddout);
+    }
+  }
+
+  T* transformed_dout_channel_ = transformed_dout.data<T>();
+  if (dfilter) {
+    ddx_ = transformed_ddx_channel.data<T>();
+    for (int i = 0; i < groups; i++) {
+#ifdef PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(
+                dynload::miopenConvolutionBackwardWeights(
+                    handle,
+                    &alpha,
+                    args3.odesc.desc(),
+                    ddx_ + i * group_offset_in,
+                    args3.idesc.desc(),
+                    transformed_dout_channel_ + i * group_offset_out,
+                    args3.cdesc.desc(),
+                    filter_algo,
+                    &beta,
+                    args3.wdesc.desc(),
+                    dfilter_ + i * group_offset_filter,
+                    workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
+#else   // PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardFilter(
+                handle,
+                &alpha,
+                args3.idesc.desc(),
+                transformed_dout_channel_ + i * group_offset_out,
+                args3.odesc.desc(),
+                ddx_ + i * group_offset_in,
+                args3.cdesc.desc(),
+                filter_algo,
+                workspace_ptr,
+                workspace_size,
+                &beta,
+                args3.wdesc.desc(),
+                dfilter_ + i * group_offset_filter));
+          },
+          workspace_size);
+#endif  // PADDLE_WITH_HIP
+    }
+  }
+
+  if (dx) {
+    ddfilter_ = ddfilter.data<T>();
+    for (int i = 0; i < groups; i++) {
+#ifdef PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward(
+                handle,
+                &alpha,
+                args4.idesc.desc(),
+                transformed_dout_channel_ + i * group_offset_out,
+                args4.wdesc.desc(),
+                ddfilter_ + i * group_offset_filter,
+                args4.cdesc.desc(),
+                data_algo,
+                &beta,
+                args4.odesc.desc(),
+                transformed_dx_ + i * group_offset_in,
+                workspace_ptr,
+                workspace_size));
+          },
+          workspace_size);
+#else   // PADDLE_WITH_HIP
+      wkspace_handle.RunFunc(
+          [&](void* workspace_ptr) {
+            PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionForward(
+                handle,
+                &alpha,
+                args4.idesc.desc(),
+                transformed_dout_channel_ + i * group_offset_out,
+                args4.wdesc.desc(),
+                ddfilter_ + i * group_offset_filter,
+                args4.cdesc.desc(),
+                data_algo,
+                workspace_ptr,
+                workspace_size,
+                &beta,
+                args4.odesc.desc(),
+                transformed_dx_ + i * group_offset_in));
+          },
+          workspace_size);
+#endif  // PADDLE_WITH_HIP
+    }
+    if (channel_last) {
+      TransToChannelLast<Context, T>(ctx, &transformed_dx_channel, dx);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Conv3dTransposeGradGPUDNNKernel(const Context& ctx,
+                                     const DenseTensor& x,
+                                     const DenseTensor& filter,
+                                     const DenseTensor& dout,
+                                     const std::vector<int>& strides,
+                                     const std::vector<int>& paddings_,
+                                     const std::vector<int>& output_padding,
+                                     const std::vector<int>& output_size,
+                                     const std::string& padding_algorithm,
+                                     int groups,
+                                     const std::vector<int>& dilations_,
+                                     const std::string& data_format,
+                                     DenseTensor* dx,
+                                     DenseTensor* dfilter) {
+  ConvTransposeGradRawGPUDNNKernel<T, Context>(ctx,
+                                               x,
+                                               filter,
+                                               dout,
+                                               strides,
+                                               paddings_,
+                                               padding_algorithm,
+                                               groups,
+                                               dilations_,
+                                               data_format,
+                                               dx,
+                                               dfilter);
+}
+
+}  // namespace phi
+
+using float16 = phi::dtype::float16;
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(conv2d_transpose_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeDoubleGradGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(conv3d_transpose_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGradGPUDNNKernel,
+                   float,
+                   float16) {}
+#else
+PD_REGISTER_KERNEL(conv2d_transpose_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(conv2d_transpose_grad_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeDoubleGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(conv3d_transpose_grad,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGradGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
new file mode 100644
index 0000000000000..5de2df4a70c88
--- /dev/null
+++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
@@ -0,0 +1,381 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/operators/conv_miopen_helper.h"
+#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h"
+#else
+#include "paddle/fluid/operators/conv_cudnn_helper.h"
+#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h"
+#endif
+
+namespace phi {
+
+using GPUDNNDataLayout = paddle::platform::DataLayout;
+
+template <typename T, typename Context>
+void ConvTransposeRawGPUDNNKernel(const Context& ctx,
+                                  const DenseTensor& x,
+                                  const DenseTensor& filter,
+                                  const std::vector<int>& strides,
+                                  const std::vector<int>& paddings,
+                                  const std::string& padding_algorithm,
+                                  int groups,
+                                  const std::vector<int>& dilations,
+                                  const std::string& data_format,
+                                  DenseTensor* out) {
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ =
+      dilations;  // cudnn v5 does not support dilations
+  const T* filter_data = filter.data<T>();
+  const GPUDNNDataLayout data_layout =
+      (data_format != "NHWC" ? GPUDNNDataLayout::kNCHW
+                             : GPUDNNDataLayout::kNHWC);
+  std::vector<int> x_vec = vectorize<int>(x.dims());
+  std::vector<int> out_vec = vectorize<int>(out->dims());
+  // if channel_last, transpose to channel_first
+  DenseTensor x_transpose;
+  if (data_layout == GPUDNNDataLayout::kNHWC) {
+    if (strides.size() == 2U) {
+      std::vector<int> axis = {0, 3, 1, 2};
+      for (size_t i = 0; i < axis.size(); ++i) {
+        x_vec[i] = x.dims()[axis[i]];
+        out_vec[i] = out->dims()[axis[i]];
+      }
+      x_transpose = Transpose<T, Context>(ctx, x, axis);
+    } else if (strides.size() == 3U) {
+      std::vector<int> axis = {0, 4, 1, 2, 3};
+      for (size_t i = 0; i < axis.size(); ++i) {
+        x_vec[i] = x.dims()[axis[i]];
+        out_vec[i] = out->dims()[axis[i]];
+      }
+      x_transpose = Transpose<T, Context>(ctx, x, axis);
+    }
+  } else {
+    x_transpose = x;
+  }
+
+  // update padding and dilation
+  auto x_dims = x_transpose.dims();
+  auto filter_dims = filter.dims();
+  DDim x_data_dims;
+  x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
+
+  int data_dim = strides.size();  // 2d or 3d
+  bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim);
+
+  std::vector<int> x_pad(x_dims.size() * 2, 0);
+  DenseTensor transformed_x;
+  std::vector<int> padding_common(data_dim, 0);
+  if (!is_sys_pad) {
+    std::vector<int> padding_diff(data_dim);
+    std::vector<int> new_x_shape_vec(data_dim + 2);
+    new_x_shape_vec[0] = x_dims[0];
+    new_x_shape_vec[1] = x_dims[1];
+
+    for (size_t i = 0; i < data_dim; ++i) {
+      padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]);
+      padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]);
+      new_x_shape_vec[i + 2] = x_dims[i + 2] + padding_diff[i];
+      x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i];
+      x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i];
+    }
+    DDim new_x_shape(make_ddim(new_x_shape_vec));
+    transformed_x.Resize(new_x_shape);
+    ctx.template Alloc<T>(&transformed_x);
+
+    const int rank = x_dims.size();
+    T pad_value(0.0);
+    switch (rank) {
+      case 4: {
+        funcs::PadFunction<Context, T, 4>(
+            ctx, x_pad, x_transpose, pad_value, &transformed_x);
+      } break;
+      case 5: {
+        funcs::PadFunction<Context, T, 5>(
+            ctx, x_pad, x_transpose, pad_value, &transformed_x);
+      } break;
+      default:
+        PADDLE_THROW(errors::InvalidArgument(
+            "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor."));
+    }
+  } else {
+    transformed_x = x_transpose;
+    if (paddings_.size() == data_dim) {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[i];
+      }
+    } else {
+      for (size_t i = 0; i < data_dim; ++i) {
+        padding_common[i] = paddings_[2 * i];
+      }
+    }
+  }
+
+  std::vector<int64_t> starts(data_dim, 0);
+  std::vector<int64_t> ends(data_dim, 0);
+  std::vector<int64_t> axes(data_dim, 0);
+  for (size_t i = 0; i < data_dim; ++i) {
+    starts[i] = x_pad[2 * i + 4] * (strides[i] + 1);
+    ends[i] = starts[i] + out_vec[i + 2];
+    axes[i] = i + 2;
+  }
+
+  const T* x_data = transformed_x.data<T>();
+  x_vec = vectorize<int>(transformed_x.dims());
+
+  std::vector<int> transformed_out_vec = out_vec;
+  for (size_t i = 0; i < data_dim; ++i) {
+    transformed_out_vec[i + 2] =
+        out_vec[i + 2] + (x_pad[2 * i + 4] + x_pad[2 * i + 5]) * strides[i] -
+        2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1];
+  }
+
+  DenseTensor transformed_out;
+  if (!is_sys_pad) {
+    transformed_out.Resize(make_ddim(transformed_out_vec));
+    ctx.template Alloc<T>(&transformed_out);
+  } else {
+    ctx.template Alloc<T>(out);
+    transformed_out.ShareDataWith(*out);
+    transformed_out.Resize(make_ddim(transformed_out_vec));
+  }
+  T* transformed_out_data = transformed_out.data<T>();
+
+  GPUDNNDataLayout layout;
+
+  int iwo_groups = groups;
+  int c_groups = 1;
+#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
+  iwo_groups = 1;
+  c_groups = groups;
+  groups = 1;
+#endif
+
+  if (strides.size() == 2U) {
+    layout = GPUDNNDataLayout::kNCHW;
+  } else {
+    layout = GPUDNNDataLayout::kNCDHW;
+  }
+
+  size_t workspace_size = 0;
+#ifdef PADDLE_WITH_HIP
+  miopenConvBwdDataAlgorithm_t algo{};
+#else
+  cudnnConvolutionBwdDataAlgo_t algo{};
+#endif
+  // ------------------- cudnn conv algorithm ---------------------
+  auto handle = ctx.cudnn_handle();
+  auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
+  bool deterministic = FLAGS_cudnn_deterministic;
+
+  auto dtype = paddle::platform::CudnnDataType<T>::type;
+  // ------------------- cudnn descriptors ---------------------
+  paddle::operators::ConvArgs args{&transformed_out,
+                                   &filter,
+                                   &transformed_x,
+                                   strides,
+                                   padding_common,
+                                   dilations_,
+                                   dtype};
+  args.handle = handle;
+  args.idesc.set(transformed_out, iwo_groups);
+  args.wdesc.set(filter, layout_tensor, iwo_groups);
+  args.odesc.set(transformed_x, iwo_groups);
+  args.cdesc.set(dtype,
+                 padding_common,
+                 strides,
+                 dilations_,
+                 paddle::platform::AllowTF32Cudnn(),
+                 c_groups);
+
+#ifdef PADDLE_WITH_HIP
+  using search =
+      paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+  workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
+  algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
+#else
+  using search =
+      paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
+  algo = search::Find<T>(args, false, deterministic, ctx);
+  workspace_size =
+      std::max(workspace_size, search::GetWorkspaceSize(args, algo));
+#endif
+
+  // ------------------- cudnn conv transpose forward ---------------------
+  int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups;
+  int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups;
+  int filter_offset = filter.numel() / groups;
+  paddle::operators::ScalingParamType<T> alpha = 1.0f;
+  paddle::operators::ScalingParamType<T> beta = 0.0f;
+  auto workspace_handle = ctx.cudnn_workspace_handle();
+  for (int g = 0; g < groups; g++) {
+#ifdef PADDLE_WITH_HIP
+    auto cudnn_func = [&](void* cudnn_workspace) {
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData(
+          handle,
+          &alpha,
+          args.odesc.desc(),
+          x_data + x_offset * g,
+          args.wdesc.desc(),
+          filter_data + filter_offset * g,
+          args.cdesc.desc(),
+          algo,
+          &beta,
+          args.idesc.desc(),
+          transformed_out_data + out_offset * g,
+          cudnn_workspace,
+          workspace_size));
+    };
+#else   // PADDLE_WITH_HIP
+    auto cudnn_func = [&](void* cudnn_workspace) {
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnConvolutionBackwardData(
+          handle,
+          &alpha,
+          args.wdesc.desc(),
+          filter_data + filter_offset * g,
+          args.odesc.desc(),
+          x_data + x_offset * g,
+          args.cdesc.desc(),
+          algo,
+          cudnn_workspace,
+          workspace_size,
+          &beta,
+          args.idesc.desc(),
+          transformed_out_data + out_offset * g));
+    };
+#endif  // PADDLE_WITH_HIP
+    workspace_handle.RunFunc(cudnn_func, workspace_size);
+  }
+  if (!is_sys_pad && strides.size() == 2U) {
+    funcs::Slice<Context, T, 4>(ctx, &transformed_out, out, starts, ends, axes);
+  } else if (!is_sys_pad && strides.size() == 3U) {
+    funcs::Slice<Context, T, 5>(ctx, &transformed_out, out, starts, ends, axes);
+  }
+
+  if (data_layout == GPUDNNDataLayout::kNHWC) {
+    DenseTensor out_transpose;
+    DenseTensor out_nchw;
+    out_nchw.ShareDataWith(*out);
+    out_nchw.Resize(make_ddim(out_vec));
+
+    if (strides.size() == 2U) {
+      out_transpose = Transpose<T, Context>(ctx, out_nchw, {0, 2, 3, 1});
+    } else if (strides.size() == 3U) {
+      out_transpose = Transpose<T, Context>(ctx, out_nchw, {0, 2, 3, 4, 1});
+    }
+    *out = out_transpose;
+  }
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeGPUDNNKernel(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& filter,
+                                 const std::vector<int>& strides,
+                                 const std::vector<int>& paddings,
+                                 const std::vector<int>& output_padding,
+                                 const std::vector<int>& output_size,
+                                 const std::string& padding_algorithm,
+                                 int groups,
+                                 const std::vector<int>& dilations,
+                                 const std::string& data_format,
+                                 DenseTensor* out) {
+  ConvTransposeRawGPUDNNKernel<T, Context>(ctx,
+                                           x,
+                                           filter,
+                                           strides,
+                                           paddings,
+                                           padding_algorithm,
+                                           groups,
+                                           dilations,
+                                           data_format,
+                                           out);
+}
+
+template <typename T, typename Context>
+void Conv3dTransposeGPUDNNKernel(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& filter,
+                                 const std::vector<int>& strides,
+                                 const std::vector<int>& paddings,
+                                 const std::vector<int>& output_padding,
+                                 const std::vector<int>& output_size,
+                                 const std::string& padding_algorithm,
+                                 int groups,
+                                 const std::vector<int>& dilations,
+                                 const std::string& data_format,
+                                 DenseTensor* out) {
+  ConvTransposeRawGPUDNNKernel<T, Context>(ctx,
+                                           x,
+                                           filter,
+                                           strides,
+                                           paddings,
+                                           padding_algorithm,
+                                           groups,
+                                           dilations,
+                                           data_format,
+                                           out);
+}
+
+}  // namespace phi
+
+using float16 = phi::dtype::float16;
+
+#ifdef PADDLE_WITH_HIP
+// MIOPEN do not support double
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGPUDNNKernel,
+                   float,
+                   float16) {}
+PD_REGISTER_KERNEL(conv3d_transpose,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGPUDNNKernel,
+                   float,
+                   float16) {}
+#else
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+PD_REGISTER_KERNEL(conv3d_transpose,
+                   GPUDNN,
+                   ALL_LAYOUT,
+                   phi::Conv3dTransposeGPUDNNKernel,
+                   float,
+                   double,
+                   float16) {}
+#endif
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
index 2b2dd5118969c..77159bfc876da 100644
--- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
+++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
@@ -121,17 +121,10 @@ struct ReduceMaxFunctor {
 };
 
 template <typename Tx, typename Ty = Tx>
-struct ExpSubFunctor {
-  HOSTDEVICE inline ExpSubFunctor() { y = static_cast<Tx>(0.0f); }
-
-  HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {}
-
+struct ExpFunctor {
   HOSTDEVICE inline Ty operator()(const Tx& x) const {
-    return static_cast<Ty>(std::exp(x - y));
+    return static_cast<Ty>(std::exp(x));
   }
-
- private:
-  Tx y;
 };
 
 template <typename Tx, typename Ty = Tx>
@@ -293,10 +286,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
   }
 
   // data src
-  AccT srcdata[kBatchSize][kLoopsV][kVSize];
-  T src_tmp[kBatchSize][kLoopsV][kVSize];
-  kps::Init<AccT, kStep>(&srcdata[0][0][0], kLowInf);
-  kps::Init<T, kStep>(&src_tmp[0][0][0], -std::numeric_limits<T>::infinity());
+  // src_data: the raw data form global memory
+  // sub_data: store the data obtained by (src_data - max), used by log_softmax
+  // exp_data: store the data obtained by (exp(sub_data)), used by softmax
+  T src_data[kBatchSize][kLoopsV][kVSize];
+  AccT sub_data[kBatchSize][kLoopsV][kVSize];
+  AccT exp_data[kBatchSize][kLoopsV][kVSize];
+  kps::Init<AccT, kStep>(&sub_data[0][0][0], kLowInf);
+  kps::Init<T, kStep>(&src_data[0][0][0], -std::numeric_limits<T>::infinity());
 
   // data dst
   T out_tmp[kBatchSize][kLoopsV][kVSize];
@@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax,
   for (int i = 0; i < kBatchSize; ++i) {
     const VecT* src_v =
         reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
-    VecT* reg_v = reinterpret_cast<VecT*>(&src_tmp[i][0][0]);
+    VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
     kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
         &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
     kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
-        &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor<T, AccT>());
+        &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
   }
 
   // compute max
@@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax,
               1,
               ReduceMaxFunctor<AccT>,
               kMode::kLocalMode>(
-      &max[0], &srcdata[0][0][0], ReduceMaxFunctor<AccT>(), true);
+      &max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
   WarpReduceMax<AccT, kBatchSize, kWarpSize>(max);
 
 // compute sum
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpSubFunctor<AccT>>(
-        &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor<AccT>(max[i]));
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+        &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
+        &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
   }
   kps::Reduce<AccT,
               kVItem,
@@ -342,7 +341,7 @@ __global__ void WarpSoftmaxForward(T* softmax,
               1,
               kps::AddFunctor<AccT>,
               kMode::kLocalMode>(
-      &sum[0], &srcdata[0][0][0], kps::AddFunctor<AccT>(), true);
+      &sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
   WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
 
 // write data to global memory
@@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax,
         reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
     VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
     if (LogMode) {
-      kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnaryLogFunctor<AccT>>(
-          &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor<AccT>());
       kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
           &out_tmp[i][0][0],
-          &srcdata[i][0][0],
+          &sub_data[i][0][0],
           UnarySubFunctor<AccT>(std::log(sum[i])));
     } else {
       kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
-          &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+          &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
     }
     kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
         &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
index d163e6e278a07..3694c8f1e6c99 100644
--- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
@@ -23,7 +23,7 @@ namespace phi {
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h
index 95dbdc4443ad0..51768fbc18f01 100644
--- a/paddle/phi/kernels/graph_send_recv_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_kernel.h
@@ -25,6 +25,7 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count);
 
diff --git a/paddle/phi/kernels/grid_sample_grad_kernel.h b/paddle/phi/kernels/grid_sample_grad_kernel.h
new file mode 100644
index 0000000000000..50a8d5be260bd
--- /dev/null
+++ b/paddle/phi/kernels/grid_sample_grad_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GridSampleGradKernel(const Context &dev_ctx,
+                          const DenseTensor &x,
+                          const DenseTensor &grid,
+                          const DenseTensor &out_grid,
+                          const std::string &mode,
+                          const std::string &padding_mode,
+                          bool align_corners,
+                          DenseTensor *x_grad,
+                          DenseTensor *grid_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/grid_sample_kernel.h b/paddle/phi/kernels/grid_sample_kernel.h
new file mode 100644
index 0000000000000..2e1e9b508649b
--- /dev/null
+++ b/paddle/phi/kernels/grid_sample_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GridSampleKernel(const Context &dev_ctx,
+                      const DenseTensor &x,
+                      const DenseTensor &grid,
+                      const std::string &mode,
+                      const std::string &padding_mode,
+                      bool align_corners,
+                      DenseTensor *out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h
new file mode 100644
index 0000000000000..f7a327cd3f566
--- /dev/null
+++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& w,
+                                   const DenseTensor& label,
+                                   const DenseTensor& pre_out,
+                                   const DenseTensor& out_grad,
+                                   paddle::optional<const DenseTensor&> path,
+                                   paddle::optional<const DenseTensor&> code,
+                                   paddle::optional<const DenseTensor&> bias,
+                                   int num_classes,
+                                   bool remote_prefetch,
+                                   int trainer_id,
+                                   const std::vector<int64_t>& height_sections,
+                                   const std::vector<std::string>& epmap,
+                                   const std::vector<std::string>& table_names,
+                                   bool is_sparse,
+                                   DenseTensor* x_grad,
+                                   DenseTensor* w_grad,
+                                   DenseTensor* bias_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h
new file mode 100644
index 0000000000000..619b022904b17
--- /dev/null
+++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& w,
+                               const DenseTensor& label,
+                               paddle::optional<const DenseTensor&> path,
+                               paddle::optional<const DenseTensor&> code,
+                               paddle::optional<const DenseTensor&> bias,
+                               int num_classes,
+                               bool remote_prefetch,
+                               int trainer_id,
+                               const std::vector<int64_t>& height_sections,
+                               const std::vector<std::string>& epmap,
+                               const std::vector<std::string>& table_names,
+                               bool is_sparse,
+                               DenseTensor* out,
+                               DenseTensor* pre_out,
+                               DenseTensor* w_out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h
index 80e23d2b8e24b..7ef8a0887c75c 100644
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -130,4 +130,167 @@ void ReluDoubleGradKernel(const Context& dev_ctx,
       relu_double_grad_functor);
 }
 
+template <typename T, typename Context>
+void LeakyReluDoubleGradKernel(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& ddx,
+                               float alpha,
+                               DenseTensor* ddout) {
+  funcs::LeakyReluGradGradFunctor<T> leaky_relu_double_grad_functor;
+  leaky_relu_double_grad_functor.alpha = alpha;
+  ActivationDoubleGradImpl<T, Context, funcs::LeakyReluGradGradFunctor<T>>(
+      dev_ctx,
+      &x,
+      nullptr,
+      &ddx,
+      nullptr,
+      nullptr,
+      ddout,
+      leaky_relu_double_grad_functor);
+}
+
+template <typename T, typename Context>
+void TanhDoubleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          DenseTensor* dout_new,
+                          DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::TanhGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void TanhTripleGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& ddx,
+                          const DenseTensor& dout,
+                          const DenseTensor& d_ddout,
+                          const DenseTensor& d_dout_new,
+                          DenseTensor* d_out_new,
+                          DenseTensor* d_dout,
+                          DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_dout->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::TanhTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,  // input
+          d_dout,
+          d_out_new,
+          d_ddx);  // output
+}
+
+template <typename T, typename Context>
+void EluDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         float alpha,
+                         DenseTensor* dx,
+                         DenseTensor* ddout) {
+  if (dx) {
+    dx->Resize(x.dims());
+    dev_ctx.template Alloc<T>(dx);
+  }
+  if (ddout) {
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::ELUGradGradFunctor<T> functor;
+  functor.alpha = alpha;
+  functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
+}
+
+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::SigmoidGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_dout->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::SigmoidTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,
+          d_dout,
+          d_out_new,
+          d_ddx);
+}
+
+template <typename T, typename Context>
+void LogDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         DenseTensor* dx,
+                         DenseTensor* ddout) {
+  if (dx) {
+    dx->Resize(x.dims());
+    dev_ctx.template Alloc<T>(dx);
+  }
+  if (ddout) {
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::LogGradGradFunctor<T> functor;
+  functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
index 9f557e7463789..e3ea10705d24e 100644
--- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h
@@ -19,18 +19,17 @@
 #include "paddle/phi/kernels/cholesky_solve_kernel.h"
 #include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/expand_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/matrix_reduce.h"
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/tril_triu_op.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx,
   const auto H = y_bst_dims_vec[y_bst_ndim - 2];
   const auto W = y_bst_dims_vec[y_bst_ndim - 1];
   phi::funcs::ForRange<Context> y_for_range(dev_ctx, dy_bst.numel());
-  paddle::operators::TrilTriuCompute<T> tril_triu_functor(
+  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
       dy_bst.data<T>(), 0, !upper, H, W, dy_bst_upper.data<T>());
   y_for_range(tril_triu_functor);
 
diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
index fbcebf371a61b..bc0ed44e17a33 100644
--- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h
@@ -129,7 +129,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   DenseTensor col_matrix;
   if (is_expand) {
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
@@ -143,7 +143,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   if (dX && ddW_in) {
     Tensor ddW;
     ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-    dX->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(dX);
 
     DenseTensor transformed_dX(dX->type());
 
@@ -201,7 +201,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   // oH, oW)
   // dw convolution double grad:  im2col(vol2col) + gemm
   if (dW && ddX) {
-    dW->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(dW);
     set_zero(dev_ctx, dW, static_cast<T>(0));
     DenseTensor dW_arr = *dW;
     dW_arr.Resize(filter_matrix_shape);
@@ -244,7 +244,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
   // w/ddw(Cout, Cin, kh, kw)
   // ddy convolution double grad: im2col(vol2col) + gemm
   if (ddY) {
-    ddY->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(ddY);
 
     DenseTensor transformed_ddY(ddY->type());
     if (channel_last) {
diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
index f1971aca800b5..2deebb996a057 100644
--- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
@@ -128,7 +128,7 @@ void ConvGradKernel(const Context& dev_ctx,
   DenseTensor col_matrix;
   if (is_expand) {
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
@@ -137,7 +137,7 @@ void ConvGradKernel(const Context& dev_ctx,
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
 
   if (input_grad) {
-    input_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(input_grad);
     DenseTensor transformed_input_grad(input_grad->type());
     if (channel_last) {
       ResizeToChannelFirst<Context, T>(
@@ -203,7 +203,7 @@ void ConvGradKernel(const Context& dev_ctx,
   }
 
   if (filter_grad) {
-    filter_grad->mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(filter_grad);
     Tensor filter_grad_ = *filter_grad;
     filter_grad_.Resize(filter_matrix_shape);
     set_zero(dev_ctx, filter_grad, static_cast<T>(0));
diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h
index 1945468f02551..2ef2ed8af2809 100644
--- a/paddle/phi/kernels/impl/conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_kernel_impl.h
@@ -44,7 +44,7 @@ void ConvKernel(const Context& dev_ctx,
   // The filter will be reshaped in the calculations,
   // so here use an assignment operation,
   // that avoids modifying the variable in the Scope.
-  output->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(output);
 
   const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
 
@@ -115,7 +115,7 @@ void ConvKernel(const Context& dev_ctx,
   if (is_expand) {
     // col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
     col.Resize(col_shape);
-    col.mutable_data<T>(dev_ctx.GetPlace());
+    dev_ctx.template Alloc<T>(&col);
     col_matrix.ShareDataWith(col);
     col_matrix.Resize(col_matrix_shape);
   }
diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
new file mode 100644
index 0000000000000..d4fd952a67001
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
@@ -0,0 +1,364 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvTransposeGradRawKernel(const Context& ctx,
+                                const DenseTensor& x,
+                                const DenseTensor& filter,
+                                const DenseTensor& dout,
+                                const std::vector<int>& strides,
+                                const std::vector<int>& paddings,
+                                const std::string& padding_algorithm,
+                                int groups,
+                                const std::vector<int>& dilations,
+                                const std::string& data_format,
+                                DenseTensor* dx,
+                                DenseTensor* dfilter) {
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  // For filter, we do not use const pointer because we will do reshape,
+  // but we should avoid modifying its value.
+  DenseTensor filter_ = filter;
+
+  if ((!dx) && (!dfilter)) {
+    return;
+  }
+
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  auto x_dims = x.dims();
+  auto filter_dims = filter_.dims();
+  auto dout_dims = dout.dims();
+  const int batch_size = static_cast<int>(x.dims()[0]);
+
+  DDim in_data_dims;
+  if (data_layout != DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  } else {
+    in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
+  // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
+  std::vector<int64_t> x_shape_vec = vectorize(x.dims());
+  // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w}
+  std::vector<int64_t> filter_shape_vec = vectorize(filter_.dims());
+
+  // use col_shape in the im2col and col2im (or vol2col and col2vol)
+  // calculation
+  // col_shape_vec: {o_c, k_h, k_w, h, w} or {o_c, k_d, k_h, k_w, d, h, w} for
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  if (data_layout != DataLayout::kNHWC) {
+    col_shape_vec[0] = dout_dims[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2];
+    }
+  } else {
+    col_shape_vec[0] = dout_dims[dout_dims.size() - 1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1];
+    }
+  }
+  DDim col_shape(make_ddim(col_shape_vec));
+
+  // use col_matrix_shape in the gemm calculation
+  // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w)
+  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
+
+  // output size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
+  // output size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
+  DDim output_shape = slice_ddim(dout.dims(), 1, dout.dims().size());
+
+  // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
+  // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
+  DDim x_matrix_shape;
+  if (data_layout != DataLayout::kNHWC) {
+    x_matrix_shape = {x_dims[1], col_matrix_shape[1]};
+  } else {
+    x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]};
+  }
+
+  // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
+  DDim filter_matrix_shape;
+  if (data_layout != DataLayout::kNHWC) {
+    filter_matrix_shape = {x_dims[1], col_matrix_shape[0] / groups};
+  } else {
+    filter_matrix_shape = {x_dims[x_dims.size() - 1],
+                           col_matrix_shape[0] / groups};
+  }
+  filter_.Resize(filter_matrix_shape);
+
+  int in_step = (data_layout != DataLayout::kNHWC
+                     ? static_cast<int>(x_dims[1]) / groups
+                     : static_cast<int>(x_dims[x_dims.size() - 1]) / groups);
+  int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
+
+  // convolution transpose grad on x:
+  // im2col + gemm (similar to conv-forward)
+  // x need to compute gradient
+  auto blas = funcs::GetBlas<Context, T>(ctx);
+  if (dx || dfilter) {
+    DenseTensor col;
+    col.Resize(col_shape);
+    ctx.template Alloc<T>(&col);
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    DenseTensor col_matrix;
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    DenseTensor dfilter_;
+    funcs::SetConstant<Context, T> set_zero;
+
+    paddle::operators::math::
+        Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+            im2col;
+    paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
+    funcs::ConcatFunctor<Context, T> concat_functor;
+
+    if (dx) {
+      ctx.template Alloc<T>(dx);
+      set_zero(ctx, dx, static_cast<T>(0));
+    }
+    if (dfilter) {  // dfilter_ size (i_c, o_c/g, k_h, k_w)
+      ctx.template Alloc<T>(dfilter);
+      set_zero(ctx, dfilter, static_cast<T>(0));
+      dfilter_ = *dfilter;
+      dfilter_.Resize(filter_matrix_shape);
+    }
+
+    size_t D = x.dims().size();
+    for (int i = 0; i < batch_size; i++) {
+      // batch with size (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for
+      // channel_first
+      // batch with size (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for
+      // channel_last
+      DenseTensor dout_batch = dout.Slice(i, i + 1).Resize(output_shape);
+
+      if (data_dim == 2U) {
+        // im2col: dy -> col matrix
+        // from (o_c, o_h, o_w) to (o_c * k_h * k_w, i_h * i_w) for
+        // channel_first
+        // from (o_h, o_w, o_c) to (o_c * k_h * k_w, i_h * i_w) for
+        // channel_last
+        im2col(ctx,
+               dout_batch,
+               dilations_,
+               strides,
+               std::vector<int>{
+                   paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
+               &col,
+               data_layout);
+      } else if (data_dim == 3U) {
+        // vol2col: dy -> col_matrix
+        // from (o_c, o_d, o_h, o_w) to (o_c * k_d * k_h * k_w, i_d * i_h *
+        // i_w) for channel_first
+        // from (o_d, o_h, o_w, o_c) to (i_d * i_h * i_w, o_c * k_d * k_h *
+        // k_w) for channel_last
+        vol2col(
+            ctx, dout_batch, dilations_, strides, paddings_, &col, data_layout);
+      }
+      if (dx) {
+        // batch with size (i_c, i_h, i_w) or (i_h, i_w, i_c)
+        DenseTensor dx_batch = dx->Slice(i, i + 1).Resize(x_matrix_shape);
+
+        // gemm: dx = filter * dy
+        // (i_c, o_c * k_h * k_w) * (o_c * k_h * k_w, i_h * i_w) -> (i_c, i_h
+        // * i_w)
+        // or
+        // (i_c, o_c * k_d * k_h * k_w) * (o_c * k_d * k_h * k_w, i_d * i_h *
+        // i_w) -> (i_c,
+        // i_d, i_h, i_w)
+        // gemm: dx = dy^T * filter^T for channel_last
+
+        std::vector<DenseTensor> dx_batch_vec;
+        for (int g = 0; g < groups; g++) {
+          // dx_slice: (i_c/g, i_h * i_w) or (i_c/g, i_d * i_h * i_w)
+          // for channel_first
+          // dx_slice: (i_h * i_w, i_c/g) or (i_d * i_h * i_w, i_c/g)
+          // for channel_last
+          // filter_slice: (i_c/g, o_c/g * k_h * k_w)
+          DenseTensor filter_slice =
+              filter_.Slice(g * in_step, (g + 1) * in_step);
+          // col_matrix_slice: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d *
+          // k_h * k_w, d * h * w)
+          DenseTensor col_matrix_slice =
+              col_matrix.Slice(g * col_step, (g + 1) * col_step);
+          if (data_layout != DataLayout::kNHWC) {
+            DenseTensor dx_slice =
+                dx_batch.Slice(g * in_step, (g + 1) * in_step);
+            blas.MatMul(filter_slice,
+                        false,
+                        col_matrix_slice,
+                        false,
+                        static_cast<T>(1.0),
+                        &dx_slice,
+                        static_cast<T>(0.0));
+          } else {
+            DenseTensor dx_slice;
+            funcs::Slice<Context, T, 2>(
+                ctx, &dx_batch, &dx_slice, g * in_step, (g + 1) * in_step, 1);
+            blas.MatMul(col_matrix_slice,
+                        true,
+                        filter_slice,
+                        true,
+                        static_cast<T>(1.0),
+                        &dx_slice,
+                        static_cast<T>(0.0));
+            DDim dx_slice_shape;
+            if (data_dim == 2U) {
+              dx_slice_shape = {x_dims[1], x_dims[2], in_step};
+            } else {
+              dx_slice_shape = {x_dims[1], x_dims[2], x_dims[3], in_step};
+            }
+            dx_slice = dx_slice.Resize(dx_slice_shape);
+            dx_batch_vec.push_back(dx_slice);
+          }
+        }
+        if (data_layout == DataLayout::kNHWC) {
+          concat_functor(ctx, dx_batch_vec, static_cast<int>(D - 2), &dx_batch);
+        }
+      }
+      if (dfilter) {
+        // x batch: (i_c, i_h * i_w) or (i_h, i_w * i_c)
+        DenseTensor in_batch = x.Slice(i, i + 1).Resize(x_matrix_shape);
+        // gemm: d_filter = x * dy^T
+        // (i_c, i_h * i_w) * (i_h * i_w, o_c * k_h * k_w) -> (i_c, o_c * k_h
+        // * k_w)
+        // or
+        // (i_c, i_d * i_h * i_w) * (i_d * i_h * i_w, o_c * k_d * k_h * k_w)
+        // -> (i_c, o_c * k_d *
+        // k_h * k_w)
+        // gemm: d_filter = x^T * dy^T for channel_last
+
+        for (int g = 0; g < groups; g++) {
+          DenseTensor dfilter_slice =
+              dfilter_.Slice(g * in_step, (g + 1) * in_step);
+          DenseTensor col_matrix_slice =
+              col_matrix.Slice(g * col_step, (g + 1) * col_step);
+          if (data_layout != DataLayout::kNHWC) {
+            DenseTensor in_batch_slice =
+                in_batch.Slice(g * in_step, (g + 1) * in_step);
+            blas.MatMul(in_batch_slice,
+                        false,
+                        col_matrix_slice,
+                        true,
+                        static_cast<T>(1.0),
+                        &dfilter_slice,
+                        static_cast<T>(1.0));
+          } else {
+            DenseTensor in_batch_slice;
+            funcs::Slice<Context, T, 2>(ctx,
+                                        &in_batch,
+                                        &in_batch_slice,
+                                        g * in_step,
+                                        (g + 1) * in_step,
+                                        1);
+            blas.MatMul(in_batch_slice,
+                        true,
+                        col_matrix_slice,
+                        true,
+                        static_cast<T>(1.0),
+                        &dfilter_slice,
+                        static_cast<T>(1.0));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeGradKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& filter,
+                               const DenseTensor& dout,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& output_padding,
+                               const std::vector<int>& output_size,
+                               const std::string& padding_algorithm,
+                               int groups,
+                               const std::vector<int>& dilations,
+                               const std::string& data_format,
+                               DenseTensor* dx,
+                               DenseTensor* dfilter) {
+  ConvTransposeGradRawKernel<T, Context>(ctx,
+                                         x,
+                                         filter,
+                                         dout,
+                                         strides,
+                                         paddings,
+                                         padding_algorithm,
+                                         groups,
+                                         dilations,
+                                         data_format,
+                                         dx,
+                                         dfilter);
+}
+
+template <typename T, typename Context>
+void Conv3dTransposeGradKernel(const Context& ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& filter,
+                               const DenseTensor& dout,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& output_padding,
+                               const std::vector<int>& output_size,
+                               const std::string& padding_algorithm,
+                               int groups,
+                               const std::vector<int>& dilations,
+                               const std::string& data_format,
+                               DenseTensor* dx,
+                               DenseTensor* dfilter) {
+  ConvTransposeGradRawKernel<T, Context>(ctx,
+                                         x,
+                                         filter,
+                                         dout,
+                                         strides,
+                                         paddings,
+                                         padding_algorithm,
+                                         groups,
+                                         dilations,
+                                         data_format,
+                                         dx,
+                                         dfilter);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h
new file mode 100644
index 0000000000000..ee2faf761fe32
--- /dev/null
+++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h
@@ -0,0 +1,278 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+
+#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ConvTransposeRawKernel(const Context& ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& filter,
+                            const std::vector<int>& strides,
+                            const std::vector<int>& paddings,
+                            const std::string& padding_algorithm,
+                            int groups,
+                            const std::vector<int>& dilations,
+                            const std::string& data_format,
+                            DenseTensor* out) {
+  const DataLayout data_layout =
+      paddle::framework::StringToDataLayout(data_format);
+  // The filter will be reshaped, so it should not be constant
+  DenseTensor filter_ = filter;
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ = dilations;
+
+  auto x_dims = x.dims();
+  auto filter_dims = filter_.dims();
+  auto out_dims = out->dims();
+  const int batch_size = static_cast<int>(x.dims()[0]);
+
+  DDim in_data_dims;
+  if (data_layout != DataLayout::kNHWC) {
+    in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
+  } else {
+    in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
+  }
+  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = vectorize<int>(filter_data_dims);
+  UpdatePaddingAndDilation(
+      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
+
+  // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
+  // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
+  std::vector<int64_t> x_shape_vec = vectorize(x.dims());
+  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+  std::vector<int64_t> filter_shape_vec = vectorize(filter_.dims());
+
+  // use col_shape in the im2col and col2im (or vol2col and col2vol)
+  // calculation
+  // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w}
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  if (data_layout != DataLayout::kNHWC) {
+    col_shape_vec[0] = out_dims[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2];
+    }
+  } else {
+    col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1];
+    }
+  }
+  DDim col_shape(make_ddim(col_shape_vec));
+
+  // use col_matrix_shape in the gemm calculation
+  // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w)
+  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
+
+  DenseTensor col;
+  col.Resize(col_shape);
+  ctx.template Alloc<T>(&col);
+  // col_matrix shares the same piece of data with col,
+  // but will be reshaped into a two-dimensional matrix shape
+  // to call the matrix multiplication interface.
+  DenseTensor col_matrix;
+  col_matrix.ShareDataWith(col);
+  col_matrix.Resize(col_matrix_shape);
+
+  // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
+  // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
+  DDim out_shape = slice_ddim(out->dims(), 1, out->dims().size());
+
+  // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
+  // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
+  DDim x_matrix_shape;
+  if (data_layout != DataLayout::kNHWC) {
+    x_matrix_shape = {x_dims[1], col_matrix_shape[1]};
+  } else {
+    x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]};
+  }
+
+  // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
+  DDim filter_matrix_shape;
+  if (data_layout != DataLayout::kNHWC) {
+    filter_matrix_shape = {x_dims[1], col_matrix_shape[0]};
+  } else {
+    filter_matrix_shape = {x_dims[x_dims.size() - 1], col_matrix_shape[0]};
+  }
+  filter_.Resize(filter_matrix_shape);
+
+  ctx.template Alloc<T>(out);
+
+  funcs::SetConstant<Context, T> set_zero;
+
+  auto blas = funcs::GetBlas<Context, T>(ctx);
+  set_zero(ctx, out, static_cast<T>(0));
+
+  int in_step = (data_layout != DataLayout::kNHWC
+                     ? static_cast<int>(x_dims[1]) / groups
+                     : static_cast<int>(x_dims[x_dims.size() - 1]) / groups);
+
+  int out_step =
+      (data_layout != DataLayout::kNHWC
+           ? static_cast<int>(out_dims[1]) / groups
+           : static_cast<int>(out_dims[out_dims.size() - 1]) / groups);
+  paddle::operators::math::
+      Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
+          col2im;
+  paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
+  funcs::ConcatFunctor<Context, T> concat_functor;
+
+  // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
+  // on x)
+  size_t D = x.dims().size();
+  for (int i = 0; i < batch_size; i++) {
+    // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first
+    // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last
+    DenseTensor x_batch = x.Slice(i, i + 1).Resize(x_matrix_shape);
+
+    // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
+    // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
+    DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_shape);
+
+    std::vector<DenseTensor> out_batch_vec;
+    for (int g = 0; g < groups; g++) {
+      int64_t start = g * in_step;
+      int64_t end = (g + 1) * in_step;
+      int axes = (data_layout != DataLayout::kNHWC ? 0 : 1);
+      DenseTensor filter_slice = filter_.Slice(g * in_step, (g + 1) * in_step);
+      DenseTensor in_slice, out_slice;
+
+      // col_matrix = filter_slice * x_slice
+      // of shape (o_c/g * k_h * k_w, h * w)
+      // or (o_c/g * k_d * k_h * k_w, d * h * w)
+      if (data_layout != DataLayout::kNHWC) {
+        in_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
+        out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        blas.MatMul(filter_slice,
+                    true,
+                    in_slice,
+                    false,
+                    static_cast<T>(1.0),
+                    &col_matrix,
+                    static_cast<T>(0.0));
+      } else {
+        funcs::Slice<Context, T, 2>(ctx, &x_batch, &in_slice, start, end, axes);
+        start = g * out_step;
+        end = (g + 1) * out_step;
+        axes = D - 2;
+        if (D == 4U) {
+          funcs::Slice<Context, T, 3>(
+              ctx, &out_batch, &out_slice, start, end, axes);
+        } else if (D == 5U) {
+          funcs::Slice<Context, T, 4>(
+              ctx, &out_batch, &out_slice, start, end, axes);
+        }
+        blas.MatMul(filter_slice,
+                    true,
+                    in_slice,
+                    true,
+                    static_cast<T>(1.0),
+                    &col_matrix,
+                    static_cast<T>(0.0));
+      }
+
+      if (data_dim == 2U) {
+        // col2im: col_matrix -> dy from (o_c/g * k_h * k_w, h * w) to (o_c/g,
+        // o_h, o_w) or (o_h, o_w, o_c/g)
+        col2im(ctx,
+               col,
+               dilations_,
+               strides,
+               std::vector<int>{
+                   paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
+               &out_slice,
+               data_layout);
+      } else if (data_dim == 3U) {
+        // col2vol: col_matrix -> dy from (o_c/g * k_d * k_h * k_w, d * h * w)
+        // to (o_c/g, o_d, o_h, o_w) or (o_d, o_h, o_w, o_c/g)
+        col2vol(
+            ctx, col, dilations_, strides, paddings_, &out_slice, data_layout);
+      }
+      if (data_layout == DataLayout::kNHWC) {
+        out_batch_vec.push_back(out_slice);
+      }
+    }
+    if (data_layout == DataLayout::kNHWC) {
+      concat_functor(ctx, out_batch_vec, static_cast<int>(D - 2), &out_batch);
+    }
+  }
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const std::vector<int>& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out) {
+  ConvTransposeRawKernel<T, Context>(ctx,
+                                     x,
+                                     filter,
+                                     strides,
+                                     paddings,
+                                     padding_algorithm,
+                                     groups,
+                                     dilations,
+                                     data_format,
+                                     out);
+}
+
+template <typename T, typename Context>
+void Conv3dTransposeKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const std::vector<int>& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out) {
+  ConvTransposeRawKernel<T, Context>(ctx,
+                                     x,
+                                     filter,
+                                     strides,
+                                     paddings,
+                                     padding_algorithm,
+                                     groups,
+                                     dilations,
+                                     data_format,
+                                     out);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
new file mode 100644
index 0000000000000..d8795808a643d
--- /dev/null
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -0,0 +1,173 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+namespace phi {
+
+template <typename T>
+HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data,
+                                const int data_width,
+                                const int height,
+                                const int width,
+                                T h,
+                                T w) {
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  T lh = h - h_low;
+  T lw = w - w_low;
+  T hh = 1 - lh;
+  T hw = 1 - lw;
+
+  T v1 =
+      (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
+  T v2 = (h_low >= 0 && w_high <= width - 1)
+             ? bottom_data[h_low * data_width + w_high]
+             : 0;
+  T v3 = (h_high <= height - 1 && w_low >= 0)
+             ? bottom_data[h_high * data_width + w_low]
+             : 0;
+  T v4 = (h_high <= height - 1 && w_high <= width - 1)
+             ? bottom_data[h_high * data_width + w_high]
+             : 0;
+
+  T w1 = hh * hw;
+  T w2 = hh * lw;
+  T w3 = lh * hw;
+  T w4 = lh * lw;
+
+  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+}
+
+template <typename T, typename Context>
+void ModulatedDeformableIm2col(const Context& dev_ctx,
+                               const T* data_im,
+                               const T* data_offset,
+                               const T* data_mask,
+                               const std::vector<int64_t>& im_shape,
+                               const std::vector<int64_t>& col_shape,
+                               const std::vector<int64_t>& filter_shape,
+                               const std::vector<int>& paddings,
+                               const std::vector<int>& strides,
+                               const std::vector<int>& dilations,
+                               const int deformable_groups,
+                               T* data_col);
+
+template <typename T, typename Context>
+void DeformableConvKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& offset,
+                          const DenseTensor& filter,
+                          const DenseTensor& mask,
+                          const std::vector<int>& strides,
+                          const std::vector<int>& paddings,
+                          const std::vector<int>& dilations,
+                          int deformable_groups,
+                          int groups,
+                          int im2col_step,
+                          DenseTensor* out) {
+  const int batch_size = static_cast<int>(x.dims()[0]);
+
+  std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
+  std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims()));
+
+  // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
+  std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
+  col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3];
+  col_buffer_shape_vec[1] = im2col_step;
+  for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
+    col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
+  }
+
+  std::vector<int64_t> output_buffer_shape_vec(1);
+  output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
+                               output_shape_vec[2] * output_shape_vec[3];
+
+  DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
+  DenseTensor output_buffer = Empty<T>(dev_ctx, output_buffer_shape_vec);
+
+  int64_t M = output_shape_vec[1] / groups;
+  int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
+  int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
+
+  DenseTensor weight_3d;
+  weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K}));
+
+  DenseTensor col_buffer_3d;
+  col_buffer_3d.ShareDataWith(col_buffer)
+      .Resize(phi::make_ddim({groups, K, N}));
+
+  DenseTensor output_4d;
+  output_4d.ShareDataWith(output_buffer)
+      .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N}));
+
+  DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
+  std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
+
+  int input_dim = x.numel() / x.dims()[0];
+  int input_offset_dim = offset.numel() / offset.dims()[0];
+  int input_mask_dim = mask.numel() / mask.dims()[0];
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  const T* input_ptr = x.data<T>();
+  const T* offset_ptr = offset.data<T>();
+  const T* mask_ptr = mask.data<T>();
+  T* col_buffer_ptr = col_buffer.data<T>();
+
+  for (int i = 0; i < batch_size / im2col_step; ++i) {
+    ModulatedDeformableIm2col(dev_ctx,
+                              input_ptr + i * im2col_step * input_dim,
+                              offset_ptr + i * im2col_step * input_offset_dim,
+                              mask_ptr + i * im2col_step * input_mask_dim,
+                              input_shape_vec,
+                              col_buffer_shape_vec,
+                              filter_shape_vec,
+                              paddings,
+                              strides,
+                              dilations,
+                              deformable_groups,
+                              col_buffer_ptr);
+    DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(
+        phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
+    // get the product of pixel and weight
+    for (int g = 0; g < groups; ++g) {
+      DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
+          phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
+      DenseTensor col_buffer_3d_slice =
+          col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
+              col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
+      DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
+          phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
+      blas.MatMul(weight_3d_slice,
+                  false,
+                  col_buffer_3d_slice,
+                  false,
+                  T(1.0),
+                  &output_3d_slice,
+                  T(0.0));
+    }
+  }
+  out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
new file mode 100644
index 0000000000000..e4356e9af3937
--- /dev/null
+++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/determinant_grad_kernel.h"
+
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+namespace detail {
+
+template <typename T>
+struct FoundZeroFunctor {
+  FoundZeroFunctor(const T* x, int64_t numel, bool* res)
+      : x_(x), numel_(numel), res_(res) {}
+  HOSTDEVICE void operator()(size_t idx) const {
+    if (*res_ || idx >= static_cast<size_t>(numel_)) {
+      // founded zero number
+      return;
+    }
+    *res_ = (x_[idx] == static_cast<T>(0));
+  }
+  const T* x_;
+  int64_t numel_;
+  bool* res_;
+};
+
+template <typename T, typename Context>
+inline bool CheckMatrixInvertible(const Context& dev_ctx,
+                                  const DenseTensor* det) {
+  auto numel = det->numel();
+
+  DenseTensor dev_tensor = phi::Empty<bool, Context>(dev_ctx, {1});
+
+  // set false
+  phi::funcs::SetConstant<Context, bool> zero;
+  zero(dev_ctx, &dev_tensor, false);
+
+  // find whether zero
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  FoundZeroFunctor<T> functor(det->data<T>(), numel, dev_tensor.data<bool>());
+  for_range(functor);
+
+  // copy to host
+  DenseTensor cpu_tensor;
+  phi::Copy<Context>(dev_ctx, dev_tensor, phi::CPUPlace(), false, &cpu_tensor);
+
+  // if founded zero, the matrix is not invertible
+  // else the matrix is invertible
+  auto* res = cpu_tensor.data<bool>();
+  return !(*res);
+}
+
+}  // namespace detail
+
+template <typename T, typename Context>
+void DeterminantGradKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& out,
+                           const DenseTensor& out_grad,
+                           DenseTensor* x_grad) {
+  auto input_dims_size = x.dims().size();
+  if (input_dims_size > 2) {
+    PADDLE_ENFORCE_EQ(
+        out_grad.dims().size() + 2,
+        input_dims_size,
+        phi::errors::InvalidArgument(
+            "The grad tensor of det dims size should be 2 less than"
+            " input tensor's, but here differ %d",
+            input_dims_size - out_grad.dims().size()));
+  } else if (input_dims_size == 2) {
+    // input dims size 2 and grad dims size 1 is possible
+    PADDLE_ENFORCE_EQ(
+        out_grad.dims().size(),
+        1,
+        phi::errors::InvalidArgument(
+            "The grad tensor of det dims size should be 2 less than"
+            " input tensor's, but here differ %d",
+            input_dims_size - out_grad.dims().size()));
+  } else {
+    // checked in forward, pass
+  }
+
+  // Check Whether the matrix is invertible
+  // (matrix A not invertible) == (det(A)=0)
+  if (!detail::CheckMatrixInvertible<T, Context>(dev_ctx, &out)) {
+    // The matrix is not invertible
+    VLOG(3) << "The input matrix not invertible!";
+    x_grad->Resize(x.dims());
+    phi::Full<T>(
+        dev_ctx, phi::vectorize(x.dims()), static_cast<T>(0.0f), x_grad);
+    return;
+  }
+
+  // The matrix is invertible
+  // let |A| = Determinant(A)
+  // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
+  // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2,
+  // -1)
+
+  // First: inverse(A)
+  DenseTensor inverse_A;
+  // A must be square matrices!
+  inverse_A.Resize(x.dims());
+  dev_ctx.template Alloc<T>(&inverse_A);
+
+  phi::funcs::MatrixInverseFunctor<Context, T> mat_inv;
+  mat_inv(dev_ctx, x, &inverse_A);
+
+  VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
+
+  // Second: inverse(A).transpose(-2, -1)
+  DenseTensor transpose_inverse_A =
+      phi::TransposeLast2Dim<T>(dev_ctx, inverse_A);
+
+  VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: "
+          << transpose_inverse_A.dims();
+
+  // Third: dA * |A|
+  auto mul_dA_detA = phi::Multiply<T>(dev_ctx, out_grad, out);
+  VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims();
+
+  // Fourth: unsqueeze(dA * |A|, [-1, -2])
+  auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1);
+  auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2);
+  VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims();
+
+  // Finally: unsqueeze(dA * |A|) * inverse(A)
+  auto res = phi::Multiply<T>(dev_ctx, unsqueeze2, transpose_inverse_A);
+
+  VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims();
+
+  x_grad->Resize(x.dims());
+  VLOG(3) << "d|A| dims: " << x_grad->dims();
+
+  phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h
new file mode 100644
index 0000000000000..f3a611b89c95c
--- /dev/null
+++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/determinant_kernel.h"
+
+#include <Eigen/Dense>
+#include <Eigen/LU>
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "paddle/phi/core/enforce.h"
+
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace phi {
+namespace detail {
+template <typename T>
+class EigenMatrix {};
+
+template <>
+class EigenMatrix<float> {
+ public:
+  using MatrixType = Eigen::MatrixXf;
+};
+
+template <>
+class EigenMatrix<double> {
+ public:
+  using MatrixType = Eigen::MatrixXd;
+};
+
+inline int64_t GetBatchCount(const DDim dims) {
+  int64_t batch_count = 1;
+  auto dim_size = dims.size();
+  PADDLE_ENFORCE_GE(
+      dim_size,
+      2,
+      phi::errors::InvalidArgument(
+          "the input matrix dimension size should greater than 2."));
+
+  // Cumulative multiplying each dimension until the last 2 to get the batch
+  // count,
+  // for example a tensor with shape [3,3,3,3], the batch count of matrices is
+  // 9.
+  for (int64_t i = 0; i < dims.size() - 2; i++) {
+    batch_count *= dims[i];
+  }
+
+  return batch_count;
+}
+}  // namespace detail
+
+template <typename T, typename Context>
+struct DeterminantFunctor {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& input,
+                  int64_t rank,
+                  int64_t batch_count,
+                  DenseTensor* output) {
+    std::vector<T> input_vec;
+    std::vector<T> output_vec;
+    paddle::framework::TensorToVector(input, dev_ctx, &input_vec);
+    for (int64_t i = 0; i < batch_count; ++i) {  // maybe can be parallel
+      auto begin_iter = input_vec.begin() + i * rank * rank;
+      auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
+      std::vector<T> sub_vec(begin_iter,
+                             end_iter);  // get every square matrix data
+      typename detail::EigenMatrix<T>::MatrixType matrix(rank, rank);
+      for (int64_t i = 0; i < rank; ++i) {
+        for (int64_t j = 0; j < rank; ++j) {
+          matrix(i, j) = sub_vec[rank * i + j];
+        }
+      }
+      output_vec.push_back(matrix.determinant());
+    }
+    paddle::framework::TensorFromVector(output_vec, output);
+  }
+};
+
+template <typename T, typename Context>
+void DeterminantKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out) {
+  auto input_dim = vectorize(x.dims());
+  auto input_dim_size = input_dim.size();
+
+  auto batch_count = detail::GetBatchCount(x.dims());
+  VLOG(10) << "input dim:" << x.dims();
+  PADDLE_ENFORCE_GE(
+      input_dim_size,
+      2,
+      phi::errors::InvalidArgument(
+          "the input matrix dimension size should greater than 2."));
+  PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1],
+                    input_dim[input_dim_size - 2],
+                    phi::errors::InvalidArgument(
+                        "the input matrix should be square matrix."));
+  auto rank = input_dim[input_dim_size - 1];  // square matrix length
+  DeterminantFunctor<T, Context>()(dev_ctx, x, rank, batch_count, out);
+  auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2);
+  if (input_dim_size > 2) {
+    out->Resize(output_dims);
+  } else {
+    // when input is a two-dimension matrix, The det value is a number.
+    out->Resize({1});
+  }
+  VLOG(10) << "output dim:" << out->dims();
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
index 5b71fd7fa3a5e..5e06435b28e27 100644
--- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h
@@ -16,11 +16,11 @@
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 #include "paddle/phi/kernels/funcs/diag_functor.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/math_kernel.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 
diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
new file mode 100644
index 0000000000000..1877a4ecc227e
--- /dev/null
+++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/frobenius_norm_grad_kernel.h"
+
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FrobeniusNormGradKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& out,
+                             const DenseTensor& dout,
+                             const std::vector<int64_t>& axis,
+                             bool keep_dim,
+                             bool reduce_all,
+                             DataType in_dtype,
+                             DataType out_dtype,
+                             DenseTensor* dx) {
+  ReduceGradKernel<Context, T, funcs::FrobeniusNormGradFunctor>(
+      ctx, x, out, dout, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
similarity index 55%
rename from paddle/phi/kernels/cpu/reduce_max_kernel.cc
rename to paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
index f9ea0aa0faf06..8577a4e3c6345 100644
--- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc
+++ b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h
@@ -12,28 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#pragma once
+
+#include "paddle/phi/kernels/frobenius_norm_kernel.h"
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/reduce.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 
 namespace phi {
 
 template <typename T, typename Context>
-void MaxRawKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const std::vector<int64_t>& dims,
-                  bool keep_dim,
-                  bool reduce_all,
-                  DenseTensor* out) {
-  auto out_dtype = x.dtype();
-  phi::Reduce<CPUContext, T, phi::funcs::MaxFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+void FrobeniusNormKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const std::vector<int64_t>& axis,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* out) {
+  Reduce<Context, T, funcs::FrobeniusNormFunctor>(
+      ctx, x, reduce_all, axis, keep_dim, x.dtype(), out);
 }
 
 }  // namespace phi
-
-PD_REGISTER_KERNEL(
-    max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {}
diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h
new file mode 100644
index 0000000000000..25247ceaff6c0
--- /dev/null
+++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h
@@ -0,0 +1,176 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cmath>
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+// TODO(xiongkun): remove the header when decouple the memcpy function in phi.
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace phi {
+using Tensor = DenseTensor;
+template <typename DeviceContext, typename T>
+struct GetTensorValue {
+  T operator()(const DeviceContext& ctx, const DenseTensor& tensor) const;
+};
+
+template <typename DeviceContext, typename T>
+struct IscloseFunctor {
+  void operator()(const DeviceContext& ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const float rtol,
+                  const float atol,
+                  bool equal_nan,
+                  DenseTensor* output);
+};
+
+template <typename T>
+struct GetTensorValue<phi::CPUContext, T> {
+  T operator()(const phi::CPUContext& dev_ctx,
+               const DenseTensor& tensor) const {
+    return *(tensor.data<T>());
+  }
+};
+
+template <typename T>
+struct GetTensorValue<phi::GPUContext, T> {
+  T operator()(const phi::GPUContext& dev_ctx,
+               const DenseTensor& tensor) const {
+    const T* data = tensor.data<T>();
+    T value;
+    const auto gpu_place = dev_ctx.GetPlace();
+    paddle::memory::Copy(
+        phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
+    return value;
+  }
+};
+
+template <typename T>
+struct IscloseFunctor<phi::CPUContext, T> {
+  void operator()(const phi::CPUContext& ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const double rtol,
+                  const double atol,
+                  bool equal_nan,
+                  DenseTensor* output) {
+    auto* in_a = in.data<T>();
+    auto* in_b = other.data<T>();
+    auto* out_data = ctx.template Alloc<bool>(output);
+    auto num = in.numel();
+    // *out_data = true;
+    for (int i = 0; i < num; i++) {
+      out_data[i] = true;
+    }
+    for (int i = 0; i < num; i++) {
+      const T a = in_a[i], b = in_b[i];
+      bool val;
+      if (std::isnan(a) || std::isnan(b)) {
+        val = equal_nan && std::isnan(a) == std::isnan(b);
+      } else {
+        T left = (a > b ? a - b : b - a);
+        T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
+        T diff = (left > right ? left - right : right - left);
+        val = a == b || left <= right || diff <= 1e-15;
+      }
+      // *out_data &= val;
+      out_data[i] = val;
+    }
+  }
+};
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T>
+__global__ void IscloseCUDAKernel(const T* in_data,
+                                  const T* other_data,
+                                  const double rtol,
+                                  const double atol,
+                                  bool equal_nan,
+                                  int num,
+                                  bool* out_data) {
+  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  bool val;
+  for (int i = idx; i < num; i += blockDim.x * gridDim.x) {
+    const T a = in_data[i], b = other_data[i];
+    if (isnan(a) || isnan(b)) {
+      val = equal_nan && isnan(a) == isnan(b);
+    } else {
+      T left = (a > b ? a - b : b - a);
+      T right = atol + (b > 0 ? rtol * b : (-rtol) * b);
+      T diff = (left > right ? left - right : right - left);
+      val = a == b || left <= right || diff <= 1e-15;
+    }
+    out_data[i] = val;
+    // if (!val) *out_data = false;
+  }
+}
+
+template <typename T>
+struct IscloseFunctor<phi::GPUContext, T> {
+  void operator()(const phi::GPUContext& dev_ctx,
+                  const DenseTensor& in,
+                  const DenseTensor& other,
+                  const double rtol,
+                  const double atol,
+                  bool equal_nan,
+                  DenseTensor* output) {
+    int num = in.numel();
+    const T* in_data = in.data<T>();
+    const T* other_data = other.data<T>();
+    bool* out_data = dev_ctx.template Alloc<bool>(output);
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+    grid = (grid > block) ? block : grid;
+#ifdef PADDLE_WITH_HIP
+    hipMemset(out_data, true, num * sizeof(bool));
+#else
+    cudaMemset(out_data, true, num * sizeof(bool));
+#endif
+    IscloseCUDAKernel<T><<<grid, block, 0, dev_ctx.stream()>>>(
+        in_data, other_data, rtol, atol, equal_nan, num, out_data);
+  }
+};
+#endif
+
+template <typename T, typename Context>
+void IscloseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   const Scalar& rtol,
+                   const Scalar& atol,
+                   bool equal_nan,
+                   DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      atol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument("Input(Atol) type must be double"));
+
+  PADDLE_ENFORCE_EQ(
+      rtol.dtype(),
+      DataType::FLOAT64,
+      phi::errors::InvalidArgument("Input(Rtol) type must be double"));
+
+  IscloseFunctor<Context, T>()(
+      dev_ctx, x, y, rtol.to<double>(), atol.to<double>(), equal_nan, out);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
new file mode 100644
index 0000000000000..1ae90960ef445
--- /dev/null
+++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+using Array1 = Eigen::DSizes<int64_t, 1>;
+template <typename T>
+struct KLDivLossBackward {
+  HOSTDEVICE KLDivLossBackward() {}
+
+  HOSTDEVICE T operator()(const T& target, const T& grad) const {
+    if (target <= 0) {
+      return 0;
+    } else {
+      return static_cast<T>(-1.) * grad;
+    }
+  }
+};
+
+template <typename T, typename Context>
+void KLDivLossGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& label,
+                         const DenseTensor& d_out,
+                         const std::string& reduction,
+                         DenseTensor* d_x) {
+  auto& place = *dev_ctx.eigen_device();
+  auto* target = &label;
+  auto* input_grad = d_x;
+  auto* loss_grad = &d_out;
+
+  const int n = input_grad->dims()[0];
+  const int numel = input_grad->numel();
+  const int expand = numel / loss_grad->numel();
+
+  dev_ctx.template Alloc<T>(input_grad);
+
+  auto target_t = phi::EigenVector<T>::Flatten(*target);
+
+  auto input_grad_t = phi::EigenVector<T>::Flatten(*input_grad);
+  auto loss_grad_t = phi::EigenVector<T>::Flatten(*loss_grad);
+
+  auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
+  auto grad_t = target_t * loss_grad_expand;
+  input_grad_t.device(place) =
+      target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
+
+  if ("mean" == reduction) {
+    input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
+  } else if ("batchmean" == reduction) {
+    input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
new file mode 100644
index 0000000000000..ecd23bbfc1c45
--- /dev/null
+++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+using Array1 = Eigen::DSizes<int64_t, 1>;
+template <typename T>
+struct KLDivLossForward {
+  HOSTDEVICE KLDivLossForward() {}
+
+  HOSTDEVICE T operator()(const T& target, const T& input) const {
+    if (target <= 0) {
+      return 0;
+    } else {
+      return target * (std::log(target) - input);
+    }
+  }
+};
+template <typename T, typename Context>
+void KLDivLossKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& label,
+                     const std::string& reduction,
+                     DenseTensor* out) {
+  auto& place = *(dev_ctx.eigen_device());
+  auto* input = &x;
+  auto* target = &label;
+  auto* loss = out;
+
+  const int n = input->dims()[0];
+  dev_ctx.template Alloc<T>(loss);
+
+  auto input_t = phi::EigenVector<T>::Flatten(*input);
+  auto target_t = phi::EigenVector<T>::Flatten(*target);
+  auto loss_t = phi::EigenVector<T>::Flatten(*loss);
+  auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
+  if ("none" == reduction) {
+    loss_t.device(place) = output;
+  } else if ("batchmean" == reduction) {
+    auto output_sum = output.sum();
+    if (n > 0) {
+      loss_t.device(place) = output_sum / output_sum.constant(n);
+    } else {
+      loss_t.device(place) = output_sum;
+    }
+  } else if ("mean" == reduction) {
+    loss_t.device(place) = output.mean();
+  } else if ("sum" == reduction) {
+    loss_t.device(place) = output.sum();
+  }
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
new file mode 100644
index 0000000000000..8fb1f1c4fa361
--- /dev/null
+++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <unsupported/Eigen/SpecialFunctions>
+#include "paddle/phi/kernels/funcs/for_range.h"
+namespace phi {
+template <typename T>
+struct LgammaGradFunctor {
+  LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
+      : dout_(dout), x_(x), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]);
+  }
+
+ private:
+  const T* dout_;
+  const T* x_;
+  T* output_;
+  int64_t numel_;
+};
+template <typename T, typename Context>
+void LgammaGradKernel(const Context& dev_ctx,
+                      const DenseTensor& d_out,
+                      const DenseTensor& x,
+                      DenseTensor* d_x) {
+  auto numel = d_out.numel();
+  auto* dout_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* dx_data =
+      dev_ctx.template Alloc<T>(d_x, static_cast<size_t>(numel * sizeof(T)));
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  LgammaGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h
similarity index 100%
rename from paddle/phi/kernels/cpu/reduce_grad.h
rename to paddle/phi/kernels/impl/reduce_grad.h
index f56d3d3ed50f7..0b1c43b5f0402 100644
--- a/paddle/phi/kernels/cpu/reduce_grad.h
+++ b/paddle/phi/kernels/impl/reduce_grad.h
@@ -87,8 +87,8 @@ template <typename Context,
           bool kNoNeedBufferY = false>
 void ReduceGradKernel(const Context& dev_ctx,
                       const DenseTensor& x,
-                      const DenseTensor& out_grad,
                       const paddle::optional<DenseTensor>& out,
+                      const DenseTensor& out_grad,
                       const std::vector<int64_t>& dims,
                       bool keep_dim,
                       bool reduce_all,
diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
new file mode 100644
index 0000000000000..0a0c1abac8086
--- /dev/null
+++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceMaxGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
+                                                           x,
+                                                           out,
+                                                           out_grad,
+                                                           dims,
+                                                           keep_dim,
+                                                           reduce_all,
+                                                           in_dtype,
+                                                           out_dtype,
+                                                           x_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
new file mode 100644
index 0000000000000..965fc686e2783
--- /dev/null
+++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceMinGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
+                                                           x,
+                                                           out,
+                                                           out_grad,
+                                                           dims,
+                                                           keep_dim,
+                                                           reduce_all,
+                                                           in_dtype,
+                                                           out_dtype,
+                                                           x_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
new file mode 100644
index 0000000000000..fb361e3420558
--- /dev/null
+++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/reduce_grad_kernel.h"
+
+#include "paddle/phi/kernels/funcs/reduce_functor.h"
+#include "paddle/phi/kernels/impl/reduce_grad.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceProdGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad) {
+  ReduceGradKernel<Context, T, funcs::ProdGradFunctor>(dev_ctx,
+                                                       x,
+                                                       out,
+                                                       out_grad,
+                                                       dims,
+                                                       keep_dim,
+                                                       reduce_all,
+                                                       in_dtype,
+                                                       out_dtype,
+                                                       x_grad);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/reverse_kernel_impl.h b/paddle/phi/kernels/impl/reverse_kernel_impl.h
new file mode 100644
index 0000000000000..acdd46a086583
--- /dev/null
+++ b/paddle/phi/kernels/impl/reverse_kernel_impl.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/kernels/reverse_kernel.h"
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename Context, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const Context& dev_ctx,
+                  const DenseTensor& in,
+                  DenseTensor* out,
+                  const std::vector<int>& axis) {
+    Eigen::DSizes<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      if (a >= 0) {
+        reverse_axis[a] = true;
+      } else {
+        reverse_axis[Rank + a] = true;
+      }
+    }
+
+    auto in_eigen = EigenTensor<T, Rank>::From(in);
+    auto out_eigen = EigenTensor<T, Rank>::From(*out);
+    auto& dev = *dev_ctx.eigen_device();
+
+    funcs::EigenReverse<std::decay_t<decltype(dev)>, T, Rank>::Eval(
+        dev, out_eigen, in_eigen, reverse_axis);
+  }
+};
+
+template <typename T, typename Context>
+void ReverseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axis,
+                   DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  int rank = x.dims().size();
+
+  switch (rank) {
+    case 1:
+      ReverseFunctor<Context, T, 1> functor1;
+      functor1(dev_ctx, x, out, axis);
+      break;
+    case 2:
+      ReverseFunctor<Context, T, 2> functor2;
+      functor2(dev_ctx, x, out, axis);
+      break;
+    case 3:
+      ReverseFunctor<Context, T, 3> functor3;
+      functor3(dev_ctx, x, out, axis);
+      break;
+    case 4:
+      ReverseFunctor<Context, T, 4> functor4;
+      functor4(dev_ctx, x, out, axis);
+      break;
+    case 5:
+      ReverseFunctor<Context, T, 5> functor5;
+      functor5(dev_ctx, x, out, axis);
+      break;
+    case 6:
+      ReverseFunctor<Context, T, 6> functor6;
+      functor6(dev_ctx, x, out, axis);
+      break;
+    default:
+      PADDLE_THROW(phi::errors::OutOfRange(
+          "The reserve operator does not support input tensors"
+          "whose ranks are greater than 6."));
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h
new file mode 100644
index 0000000000000..c74aa5c7243f3
--- /dev/null
+++ b/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+template <typename T, typename Context>
+void SqueezeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& xshape,
+                       const DenseTensor& dout,
+                       const std::vector<int>& axes,
+                       DenseTensor* dx) {
+  auto xshape_dims = xshape.dims();
+  auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
+
+  dev_ctx.template Alloc<T>(dx);
+  phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
+  dx->Resize(x_dims);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/squeeze_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_kernel_impl.h
new file mode 100644
index 0000000000000..d2b40824a91c9
--- /dev/null
+++ b/paddle/phi/kernels/impl/squeeze_kernel_impl.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+
+namespace phi {
+template <typename T, typename Context>
+void SqueezeKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axes,
+                   DenseTensor* xshape,
+                   DenseTensor* out) {
+  auto x_dims = x.dims();
+  auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true);
+
+  dev_ctx.template Alloc<T>(out);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  out->Resize(out_dims);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
index 9b1e4b1d3a65d..044adb0230cac 100644
--- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h
@@ -21,12 +21,11 @@
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
 #include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/matrix_reduce.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
 #include "paddle/phi/kernels/triangular_solve_kernel.h"
 
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/tril_triu_op.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx,
     const auto H = dims[dims.size() - 2];
     const auto W = dims[dims.size() - 1];
     phi::funcs::ForRange<Context> x_for_range(dev_ctx, dx_bst.numel());
-    paddle::operators::TrilTriuCompute<T> tril_triu_functor(
+    phi::funcs::TrilTriuCompute<T> tril_triu_functor(
         dx_bst.data<T>(), unitriangular, !upper, H, W, dx_bst_upper.data<T>());
     x_for_range(tril_triu_functor);
 
diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h
new file mode 100644
index 0000000000000..dcc7224b5075c
--- /dev/null
+++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/tril_triu_grad_kernel.h"
+
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TrilTriuGradKernel(const Context& ctx,
+                        const DenseTensor& out_grad,
+                        int diagonal,
+                        bool lower,
+                        DenseTensor* x_grad) {
+  const auto* dout_data = out_grad.data<T>();
+  auto* dx_data = ctx.template Alloc<T>(x_grad);
+
+  const auto& dims = out_grad.dims();
+  const auto H = dims[dims.size() - 2];
+  const auto W = dims[dims.size() - 1];
+
+  phi::funcs::ForRange<Context> for_range(
+      ctx, static_cast<size_t>(out_grad.numel()));
+  phi::funcs::TrilTriuCompute<T> tril_triu_grad_computer(
+      dout_data, diagonal, lower, H, W, dx_data);
+  for_range(tril_triu_grad_computer);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h
new file mode 100644
index 0000000000000..959169d87cefd
--- /dev/null
+++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/tril_triu_kernel.h"
+
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TrilTriuKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    int diagonal,
+                    bool lower,
+                    DenseTensor* out) {
+  const auto* x_data = x.data<T>();
+  auto* out_data = ctx.template Alloc<T>(out);
+
+  const auto& dims = x.dims();
+  const auto H = dims[dims.size() - 2];
+  const auto W = dims[dims.size() - 1];
+  phi::funcs::ForRange<Context> for_range(ctx, static_cast<size_t>(x.numel()));
+
+  phi::funcs::TrilTriuCompute<T> tril_triu_computer(
+      x_data, diagonal, lower, H, W, out_data);
+  for_range(tril_triu_computer);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h
new file mode 100644
index 0000000000000..54b332ea4c898
--- /dev/null
+++ b/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+template <typename T, typename Context>
+void UnsqueezeGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x_shape,
+                         const DenseTensor& dout,
+                         DenseTensor* dx) {
+  auto xshape_dims = x_shape.dims();
+  auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
+  dev_ctx.template Alloc<T>(dx);
+  phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), true, dx);
+  dx->Resize(x_dims);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h
new file mode 100644
index 0000000000000..884fa26df451c
--- /dev/null
+++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+
+namespace phi {
+template <typename T, typename Context>
+void UnsqueezeKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const ScalarArray& axes,
+                     DenseTensor* xshape,
+                     DenseTensor* out) {
+  auto x_dims = x.dims();
+  auto out_dims = out->dims();
+  if (axes.FromTensor()) {
+    std::vector<int32_t> tmp;
+    tmp.reserve(axes.GetData().size());
+    std::for_each(axes.GetData().begin(),
+                  axes.GetData().end(),
+                  [&tmp](const int64_t& t) { tmp.push_back(t); });
+    out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
+  }
+  out->Resize(out_dims);
+  dev_ctx.template Alloc<T>(out);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  out->Resize(out_dims);  // copy will reset the dims.
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/index_select_grad_kernel.h b/paddle/phi/kernels/index_select_grad_kernel.h
new file mode 100644
index 0000000000000..c3dc1595989bf
--- /dev/null
+++ b/paddle/phi/kernels/index_select_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IndexSelectGradKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& index,
+                           const DenseTensor& out_grad,
+                           int dim,
+                           DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/index_select_kernel.h b/paddle/phi/kernels/index_select_kernel.h
new file mode 100644
index 0000000000000..124b689731157
--- /dev/null
+++ b/paddle/phi/kernels/index_select_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IndexSelectKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& index,
+                       int dim,
+                       DenseTensor* output);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/isclose_kernel.h b/paddle/phi/kernels/isclose_kernel.h
new file mode 100644
index 0000000000000..8c468da055082
--- /dev/null
+++ b/paddle/phi/kernels/isclose_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IscloseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   const Scalar& rtol,
+                   const Scalar& atol,
+                   bool equal_nan,
+                   DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kldiv_loss_grad_kernel.h b/paddle/phi/kernels/kldiv_loss_grad_kernel.h
new file mode 100644
index 0000000000000..8f53898fa6816
--- /dev/null
+++ b/paddle/phi/kernels/kldiv_loss_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+// XKTODO (change name)
+void KLDivLossGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& label,
+                         const DenseTensor& d_out,
+                         const std::string& reduction,
+                         DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h
new file mode 100644
index 0000000000000..103780ab74728
--- /dev/null
+++ b/paddle/phi/kernels/kldiv_loss_kernel.h
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KLDivLossKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& label,
+                     const std::string& reduction,
+                     DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h
new file mode 100644
index 0000000000000..488dde8237b08
--- /dev/null
+++ b/paddle/phi/kernels/kthvalue_grad_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+template <typename T, typename Context>
+void KthvalueGradKernel(const Context& dev_ctx,
+                        const DenseTensor& d_out,
+                        const DenseTensor& x,
+                        const DenseTensor& indices,
+                        int k,
+                        int axis,
+                        bool keepdim,
+                        DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h
new file mode 100644
index 0000000000000..4809b9af4832f
--- /dev/null
+++ b/paddle/phi/kernels/kthvalue_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void KthvalueKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    int k,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* out,
+                    DenseTensor* indices);
+}  // namespace phi
diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h
new file mode 100644
index 0000000000000..c32be63db4178
--- /dev/null
+++ b/paddle/phi/kernels/layer_norm_grad_kernel.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LayerNormGradKernel(const Context& ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& mean,
+                         const DenseTensor& variance,
+                         paddle::optional<const DenseTensor&> scale,
+                         paddle::optional<const DenseTensor&> bias,
+                         const DenseTensor& out_grad,
+                         float epsilon,
+                         int begin_norm_axis,
+                         bool is_test,
+                         DenseTensor* x_grad,
+                         DenseTensor* scale_grad,
+                         DenseTensor* bias_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h
new file mode 100644
index 0000000000000..c9679420bda5c
--- /dev/null
+++ b/paddle/phi/kernels/layer_norm_kernel.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LayerNormKernel(const Context& ctx,
+                     const DenseTensor& x,
+                     paddle::optional<const DenseTensor&> scale,
+                     paddle::optional<const DenseTensor&> bias,
+                     float epsilon,
+                     int begin_norm_axis,
+                     bool is_test,
+                     DenseTensor* out,
+                     DenseTensor* mean,
+                     DenseTensor* variance);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename T>
+class LayerNormDirectCUDAFunctor {
+ public:
+  void operator()(gpuStream_t stream,
+                  const T* input,
+                  std::vector<int> input_shape,
+                  const T* bias,
+                  const T* scale,
+                  T* output,
+                  T* mean,
+                  T* variance,
+                  int begin_norm_axis,
+                  float eps);
+};
+#endif
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/lgamma_grad_kernel.h b/paddle/phi/kernels/lgamma_grad_kernel.h
new file mode 100644
index 0000000000000..94173cc29c7a7
--- /dev/null
+++ b/paddle/phi/kernels/lgamma_grad_kernel.h
@@ -0,0 +1,27 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LgammaGradKernel(const Context& dev_ctx,
+                      const DenseTensor& d_out,
+                      const DenseTensor& x,
+                      DenseTensor* d_x);
+}  // namespace phi
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/phi/kernels/lgamma_kernel.h
similarity index 63%
rename from paddle/fluid/operators/reduce_ops/reduce_any_op.cu
rename to paddle/phi/kernels/lgamma_kernel.h
index 2e93e67debbd9..f61b3a1ce859e 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ b/paddle/phi/kernels/lgamma_kernel.h
@@ -1,4 +1,5 @@
-// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,9 +13,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_any,
-    ops::ReduceCudaKernel<bool, kps::LogicalOrFunctor, kps::IdentityFunctor>);
+template <typename T, typename Context>
+void LgammaKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h
new file mode 100644
index 0000000000000..6336bc14105bb
--- /dev/null
+++ b/paddle/phi/kernels/log_softmax_grad_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogSoftmaxGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out,
+                          const DenseTensor& out_grad,
+                          int axis,
+                          DenseTensor* x_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h
new file mode 100644
index 0000000000000..2caaa86d46c35
--- /dev/null
+++ b/paddle/phi/kernels/log_softmax_kernel.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LogSoftmaxKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      int axis,
+                      DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h
deleted file mode 100644
index 7569cbcff087d..0000000000000
--- a/paddle/phi/kernels/math_kernel.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/infermeta/binary.h"
-#include "paddle/phi/infermeta/unary.h"
-#include "paddle/phi/kernels/empty_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void MeanRawKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const std::vector<int64_t>& dims,
-                   bool keep_dim,
-                   bool reduce_all,
-                   DenseTensor* out);
-
-template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& dims,
-                bool keep_dim,
-                DenseTensor* out);
-
-template <typename T, typename Context>
-void SumRawKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const std::vector<int64_t>& dims,
-                  bool keep_dim,
-                  bool reduce_all,
-                  DataType out_dtype,
-                  DenseTensor* out);
-
-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               DataType out_dtype,
-               bool keep_dim,
-               DenseTensor* out);
-
-template <typename T, typename Context>
-void AddRawKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  int axis,
-                  DenseTensor* out);
-
-template <typename T, typename Context>
-void AddKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& y,
-               DenseTensor* out);
-
-template <typename T, typename Context>
-void SubtractRawKernel(const Context& dev_ctx,
-                       const DenseTensor& x,
-                       const DenseTensor& y,
-                       int axis,
-                       DenseTensor* out);
-
-template <typename T, typename Context>
-void SubtractKernel(const Context& dev_ctx,
-                    const DenseTensor& x,
-                    const DenseTensor& y,
-                    DenseTensor* out);
-
-template <typename T, typename Context>
-void DivideRawKernel(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis,
-                     DenseTensor* out);
-
-template <typename T, typename Context>
-void DivideKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* out);
-
-template <typename T, typename Context>
-void MultiplyRawKernel(const Context& dev_ctx,
-                       const DenseTensor& x,
-                       const DenseTensor& y,
-                       int axis,
-                       DenseTensor* out);
-
-template <typename T, typename Context>
-void MultiplyKernel(const Context& dev_ctx,
-                    const DenseTensor& x,
-                    const DenseTensor& y,
-                    DenseTensor* out);
-
-template <typename T, typename Context>
-DenseTensor Add(const Context& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& y) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  ElementwiseInferMeta(x, y, &meta_out);
-  AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename Context>
-DenseTensor Subtract(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  ElementwiseInferMeta(x, y, &meta_out);
-  SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename Context>
-DenseTensor Divide(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  ElementwiseInferMeta(x, y, &meta_out);
-  DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename Context>
-DenseTensor Multiply(const Context& dev_ctx,
-                     const DenseTensor& x,
-                     const DenseTensor& y) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  ElementwiseInferMeta(x, y, &meta_out);
-  MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename Context>
-DenseTensor Mean(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 const std::vector<int64_t>& axis,
-                 bool keep_dim) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
-  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
-  return dense_out;
-}
-
-template <typename T, typename Context>
-DenseTensor Sum(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& axis,
-                DataType dtype,
-                bool keep_dim) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  SumInferMeta(x, axis, dtype, keep_dim, &meta_out);
-  SumKernel<T, Context>(dev_ctx, x, axis, dtype, keep_dim, &dense_out);
-  return dense_out;
-}
-
-}  // namespace phi
diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h
new file mode 100644
index 0000000000000..ccde8c3648fa5
--- /dev/null
+++ b/paddle/phi/kernels/mode_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& indices,
+                    const DenseTensor& out_grad,
+                    int axis,
+                    bool keepdim,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h
new file mode 100644
index 0000000000000..831c4369304e5
--- /dev/null
+++ b/paddle/phi/kernels/mode_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ModeKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                int axis,
+                bool keepdim,
+                DenseTensor* out,
+                DenseTensor* indices);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/multiplex_grad_kernel.h b/paddle/phi/kernels/multiplex_grad_kernel.h
new file mode 100644
index 0000000000000..b32c9dbe10058
--- /dev/null
+++ b/paddle/phi/kernels/multiplex_grad_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexGradKernel(const Context& ctx,
+                         const DenseTensor& ids,
+                         const DenseTensor& out_grad,
+                         std::vector<DenseTensor*> ins_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/multiplex_kernel.h b/paddle/phi/kernels/multiplex_kernel.h
new file mode 100644
index 0000000000000..341c6d5cabb7c
--- /dev/null
+++ b/paddle/phi/kernels/multiplex_kernel.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MultiplexKernel(const Context& ctx,
+                     const std::vector<const DenseTensor*>& ins,
+                     const DenseTensor& ids,
+                     DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc
similarity index 62%
rename from paddle/phi/kernels/reduce_max_kernel.cc
rename to paddle/phi/kernels/one_hot_kernel.cc
index de172a12d7288..633f48cbb62ac 100644
--- a/paddle/phi/kernels/reduce_max_kernel.cc
+++ b/paddle/phi/kernels/one_hot_kernel.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/phi/kernels/reduce_max_kernel.h"
+#include "paddle/phi/kernels/one_hot_kernel.h"
 
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -20,20 +20,19 @@
 namespace phi {
 
 template <typename T, typename Context>
-void MaxKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               DenseTensor* out) {
-  bool reduce_all = false;
-  MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+void OneHotKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const Scalar& num_classes_s,
+                  DenseTensor* out) {
+  int num_classes = num_classes_s.to<int>();
+  OneHotRawKernel<T>(
+      dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out);
 }
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PD_REGISTER_KERNEL(
-    max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {}
 #endif
diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/one_hot_kernel.h
similarity index 64%
rename from paddle/phi/kernels/reduce_max_kernel.h
rename to paddle/phi/kernels/one_hot_kernel.h
index 7560473d43c71..9f89609ea6336 100644
--- a/paddle/phi/kernels/reduce_max_kernel.h
+++ b/paddle/phi/kernels/one_hot_kernel.h
@@ -14,25 +14,23 @@
 
 #pragma once
 
+#include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/infermeta/binary.h"
-#include "paddle/phi/infermeta/unary.h"
-#include "paddle/phi/kernels/empty_kernel.h"
 
 namespace phi {
 
 template <typename T, typename Context>
-void MaxRawKernel(const Context& dev_ctx,
+void OneHotKernel(const Context& dev_ctx,
                   const DenseTensor& x,
-                  const std::vector<int64_t>& dims,
-                  bool keep_dim,
-                  bool reduce_all,
+                  const Scalar& num_classes,
                   DenseTensor* out);
 
 template <typename T, typename Context>
-void MaxKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               DenseTensor* out);
+void OneHotRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     int32_t depth,
+                     DataType dtype,
+                     bool allow_out_of_range,
+                     DenseTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/pad3d_grad_kernel.h b/paddle/phi/kernels/pad3d_grad_kernel.h
new file mode 100644
index 0000000000000..38f1e5335e8c2
--- /dev/null
+++ b/paddle/phi/kernels/pad3d_grad_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Pad3dGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& out_grad,
+                     const ScalarArray& paddings,
+                     const std::string& mode,
+                     float pad_value,
+                     const std::string& data_format,
+                     DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/pad3d_kernel.h b/paddle/phi/kernels/pad3d_kernel.h
new file mode 100644
index 0000000000000..d8876c3e7bc74
--- /dev/null
+++ b/paddle/phi/kernels/pad3d_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void Pad3dKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const ScalarArray& paddings,
+                 const std::string& mode,
+                 float pad_value,
+                 const std::string& data_format,
+                 DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h
new file mode 100644
index 0000000000000..15917e2e1f02e
--- /dev/null
+++ b/paddle/phi/kernels/prelu_grad_kernel.h
@@ -0,0 +1,31 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PReluGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& alpha,
+                     const DenseTensor& out_grad,
+                     const std::string& mode,
+                     const std::string& data_format,
+                     DenseTensor* x_grad,
+                     DenseTensor* alpha_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h
new file mode 100644
index 0000000000000..251332a8158dc
--- /dev/null
+++ b/paddle/phi/kernels/prelu_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PReluKernel(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const DenseTensor& alpha,
+                 const std::string& mode,
+                 const std::string& data_format,
+                 DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 632ad00f6d06e..e02f4450a8bab 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -22,7 +22,6 @@
 #endif
 
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
-// #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 
 namespace phi {
@@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
     int index = (tidx + 1) * 2 * stride - 1;
     if (index < (blockDim.x * 2)) {
       temp[index + index / 32] =
-          compute(temp[index + index / 2],
+          compute(temp[index + index / 32],
                   temp[index - stride + (index - stride) / 32]);
     }
   }
diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h
index 2f1e2f589c512..993349f2d9e14 100644
--- a/paddle/phi/kernels/primitive/datamover_primitives.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives.h
@@ -115,6 +115,23 @@ struct BroadcastConfig {
   }
 };
 
+template <typename T>
+__device__ __forceinline__ void WriteData(T* dst,
+                                          T* __restrict__ src,
+                                          int num) {
+  for (int i = 0; i < num; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ void ReadData(T* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  for (int i = 0; i < num; i++) {
+    dst[i] = src[i];
+  }
+}
 #undef INT_BITS
 }  // namespace details
 
diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
index 53a8b7d0c9ef9..d2cfdbdec3064 100644
--- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
@@ -76,6 +76,16 @@ struct BroadcastConfig {
 };
 #pragma pack()
 
+template <typename T>
+__device__ __forceinline__ void WriteData(T* _global_ptr_ dst,
+                                          T* src,
+                                          int num) {
+  if (num > 0) {
+    LM2GM(src, dst, num * sizeof(T));
+  }
+}
+#undef INT_BITS
+
 }  // namespace details
 
 /**
diff --git a/paddle/phi/kernels/qr_kernel.h b/paddle/phi/kernels/qr_kernel.h
new file mode 100644
index 0000000000000..9c3dfb1660126
--- /dev/null
+++ b/paddle/phi/kernels/qr_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void QrKernel(const Context& ctx,
+              const DenseTensor& x,
+              const std::string& mode,
+              DenseTensor* q,
+              DenseTensor* r);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h
new file mode 100644
index 0000000000000..a4b472c445888
--- /dev/null
+++ b/paddle/phi/kernels/reduce_grad_kernel.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+
+template <typename T, typename Context>
+void ReduceSumGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+
+template <typename T, typename Context>
+void ReduceMeanGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad);
+
+template <typename T, typename Context>
+void ReduceProdGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out,
+                          const DenseTensor& out_grad,
+                          const std::vector<int64_t>& dims,
+                          bool keep_dim,
+                          bool reduce_all,
+                          DataType in_dtype,
+                          DataType out_dtype,
+                          DenseTensor* x_grad);
+
+template <typename T, typename Context>
+void ReduceMaxGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+
+template <typename T, typename Context>
+void ReduceMinGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& out,
+                         const DenseTensor& out_grad,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DataType in_dtype,
+                         DataType out_dtype,
+                         DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc
new file mode 100644
index 0000000000000..7638c782d547d
--- /dev/null
+++ b/paddle/phi/kernels/reduce_kernel.cc
@@ -0,0 +1,165 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reduce_kernel.h"
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SumKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               DataType out_dtype,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out) {
+  bool reduce_all = false;
+  MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void ProdKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out) {
+  bool reduce_all = false;
+  ProdRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void MaxKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  MaxRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void MinKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  MinRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  AllRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void AnyKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  AnyRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+}  // namespace phi
+
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+PD_REGISTER_KERNEL(
+    mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {}
+
+PD_REGISTER_KERNEL(sum,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SumKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+
+PD_REGISTER_KERNEL(
+    prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(
+    max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(
+    min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+PD_REGISTER_KERNEL(mean,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MeanKernel,
+                   float,
+                   double,
+                   bool,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
+PD_REGISTER_KERNEL(sum,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SumKernel,
+                   bool,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   int16_t,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+
+PD_REGISTER_KERNEL(
+    prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(
+    max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(
+    min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {}
+PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {}
+#endif
diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h
new file mode 100644
index 0000000000000..69bcb47bc98ea
--- /dev/null
+++ b/paddle/phi/kernels/reduce_kernel.h
@@ -0,0 +1,153 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace phi {
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out);
+
+template <typename T, typename Context>
+void ProdRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MinRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AnyRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void AllRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void SumKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               DataType out_dtype,
+               bool keep_dim,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+void ProdKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void MinKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void AnyKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void AllKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+DenseTensor Mean(const Context& dev_ctx,
+                 const DenseTensor& x,
+                 const std::vector<int64_t>& axis,
+                 bool keep_dim) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out);
+  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename Context>
+DenseTensor Sum(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& axis,
+                DataType dtype,
+                bool keep_dim) {
+  DenseTensor dense_out;
+  MetaTensor meta_out(&dense_out);
+  SumInferMeta(x, axis, dtype, keep_dim, &meta_out);
+  SumKernel<T, Context>(dev_ctx, x, axis, dtype, keep_dim, &dense_out);
+  return dense_out;
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/reverse_kernel.cc b/paddle/phi/kernels/reverse_kernel.cc
new file mode 100644
index 0000000000000..c6c2781a07bf6
--- /dev/null
+++ b/paddle/phi/kernels/reverse_kernel.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/reverse_kernel.h"
+
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReverseArrayKernel(const Context& dev_ctx,
+                        const std::vector<const DenseTensor*>& x,
+                        const std::vector<int>& axis,
+                        std::vector<DenseTensor*> out) {
+  PADDLE_ENFORCE_EQ(
+      x.size(),
+      out.size(),
+      phi::errors::InvalidArgument("The input size(%d) and output size(%d) of "
+                                   "ReverseArrayKernel is different.",
+                                   x.size(),
+                                   out.size()));
+  for (size_t offset = 0; offset < x.size(); ++offset) {
+    auto* x_tensor = x.at(offset);
+    PADDLE_ENFORCE_GT(
+        x_tensor->memory_size(),
+        0,
+        phi::errors::PreconditionNotMet(
+            "The input LoDTensorArray X[%d] holds no memory.", offset));
+    auto out_offset = x.size() - offset - 1;
+    auto* out_tensor = out.at(out_offset);
+
+    out_tensor->set_lod(x_tensor->lod());
+    phi::Copy<Context>(
+        dev_ctx, *x_tensor, dev_ctx.GetPlace(), false, out_tensor);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(reverse_array,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ReverseArrayKernel,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   bool,
+                   float,
+                   double) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(reverse_array,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ReverseArrayKernel,
+                   int,
+                   uint8_t,
+                   int64_t,
+                   bool,
+                   float,
+                   double) {}
+#endif
diff --git a/paddle/phi/kernels/reverse_kernel.h b/paddle/phi/kernels/reverse_kernel.h
new file mode 100644
index 0000000000000..2b81f4018c25d
--- /dev/null
+++ b/paddle/phi/kernels/reverse_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ReverseKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axis,
+                   DenseTensor* out);
+
+template <typename T, typename Context>
+void ReverseArrayKernel(const Context& dev_ctx,
+                        const std::vector<const DenseTensor*>& x,
+                        const std::vector<int>& axis,
+                        std::vector<DenseTensor*> out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h
new file mode 100644
index 0000000000000..eea1fa03886a4
--- /dev/null
+++ b/paddle/phi/kernels/roi_align_grad_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RoiAlignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& boxes,
+                        paddle::optional<const DenseTensor&> boxes_num,
+                        const DenseTensor& out_grad,
+                        int pooled_height,
+                        int pooled_width,
+                        float spatial_scale,
+                        int sampling_ratio,
+                        bool aligned,
+                        DenseTensor* dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h
index 16b52c563a592..9734da53b7f45 100644
--- a/paddle/phi/kernels/roi_align_kernel.h
+++ b/paddle/phi/kernels/roi_align_kernel.h
@@ -20,7 +20,7 @@
 namespace phi {
 
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& boxes,
                     paddle::optional<const DenseTensor&> boxes_num,
diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h
new file mode 100644
index 0000000000000..d7f1c378f75c3
--- /dev/null
+++ b/paddle/phi/kernels/roi_pool_grad_kernel.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RoiPooGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& boxes,
+                      paddle::optional<const DenseTensor&> boxes_num,
+                      const DenseTensor& arg_max,
+                      const DenseTensor& out_grad,
+                      int pooled_height,
+                      int pooled_width,
+                      float spatial_scale,
+                      DenseTensor* dx);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h
new file mode 100644
index 0000000000000..c6ff6f223612a
--- /dev/null
+++ b/paddle/phi/kernels/roi_pool_kernel.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+static constexpr int kROISize = 4;
+
+template <typename T, typename Context>
+void RoiPoolKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& boxes,
+                   paddle::optional<const DenseTensor&> boxes_num,
+                   int pooled_height,
+                   int pooled_width,
+                   float spatial_scale,
+                   DenseTensor* out,
+                   DenseTensor* arg_max);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h
new file mode 100644
index 0000000000000..331f3626e5657
--- /dev/null
+++ b/paddle/phi/kernels/roll_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RollGradKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& out_grad,
+                    const ScalarArray& shifts,
+                    const std::vector<int64_t>& axis,
+                    DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h
new file mode 100644
index 0000000000000..56f32174a4c00
--- /dev/null
+++ b/paddle/phi/kernels/roll_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void RollKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const ScalarArray& shifts,
+                const std::vector<int64_t>& axis,
+                DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc
new file mode 100644
index 0000000000000..fae876facfc8f
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/selected_rows/assign_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/assign_kernel.h"
+
+namespace phi {
+namespace sr {
+
+// Note: use `const paddle::optional<const SelectedRows&> x`
+// as input if needed
+template <typename Context>
+void AssignKernel(const Context& dev_ctx,
+                  const SelectedRows& x,
+                  SelectedRows* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  phi::AssignKernel<Context>(dev_ctx, x.value(), out->mutable_value());
+}
+
+}  // namespace sr
+}  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(assign_sr,
+                           CPU,
+                           ALL_LAYOUT,
+                           phi::sr::AssignKernel<phi::CPUContext>,
+                           ALL_DTYPE) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(assign_sr,
+                           GPU,
+                           ALL_LAYOUT,
+                           phi::sr::AssignKernel<phi::GPUContext>,
+                           ALL_DTYPE) {}
+#endif
diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.h b/paddle/phi/kernels/selected_rows/assign_kernel.h
new file mode 100644
index 0000000000000..2ba465615a73a
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/assign_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+namespace sr {
+
+template <typename Context>
+void AssignKernel(const Context& dev_ctx,
+                  const SelectedRows& x,
+                  SelectedRows* out);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc
new file mode 100644
index 0000000000000..cf71ab0583f61
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/selected_rows/copy_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+namespace phi {
+namespace sr {
+
+template <typename Context>
+void Copy(const Context& dev_ctx,
+          const SelectedRows& src,
+          Place dst_place,
+          bool blocking,
+          SelectedRows* dst) {
+  if (src.value().Holder() != dst->value().Holder() ||
+      src.value().data() != dst->value().data()) {
+    dst->set_rows(src.rows());
+    dst->set_height(src.height());
+  }
+  phi::Copy<Context>(
+      dev_ctx, src.value(), dst_place, blocking, dst->mutable_value());
+}
+
+}  // namespace sr
+}  // namespace phi
+
+PD_REGISTER_GENERAL_KERNEL(
+    copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy<phi::CPUContext>, ALL_DTYPE) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_GENERAL_KERNEL(
+    copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy<phi::GPUContext>, ALL_DTYPE) {}
+#endif
diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h
new file mode 100644
index 0000000000000..4aa848bea2a71
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/copy_kernel.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/core/sparse_csr_tensor.h"
+
+namespace phi {
+namespace sr {
+
+template <typename Context>
+void Copy(const Context& dev_ctx,
+          const SelectedRows& src,
+          Place dst_place,
+          bool blocking,
+          SelectedRows* dst);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc
new file mode 100644
index 0000000000000..80b2a1f6678a2
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h"
+
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h"
+
+namespace phi {
+namespace sr {
+
+static std::vector<int64_t> PathToRows(const DenseTensor& path) {
+  std::set<int64_t> rows;
+  const int64_t* paths = path.data<int64_t>();
+  for (int64_t i = 0; i < path.numel(); ++i) {
+    int64_t row = paths[i];
+    if (row < 0) {
+      continue;
+    }
+    rows.emplace(row);
+  }
+  return std::vector<int64_t>(rows.begin(), rows.end());
+}
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& w,
+                                   const DenseTensor& label,
+                                   const DenseTensor& pre_out,
+                                   const DenseTensor& out_grad,
+                                   paddle::optional<const DenseTensor&> path,
+                                   paddle::optional<const DenseTensor&> code,
+                                   paddle::optional<const DenseTensor&> bias,
+                                   int num_classes,
+                                   bool remote_prefetch,
+                                   int trainer_id,
+                                   const std::vector<int64_t>& height_sections,
+                                   const std::vector<std::string>& epmap,
+                                   const std::vector<std::string>& table_names,
+                                   bool is_sparse,
+                                   DenseTensor* x_grad,
+                                   SelectedRows* w_grad,
+                                   DenseTensor* bias_grad) {
+  PADDLE_ENFORCE_NOT_NULL(
+      path.get_ptr(),
+      errors::NotFound("Custom tree must be set for sparse mode!"));
+  paddle::framework::Vector<int64_t> real_rows = PathToRows(*path);
+  w_grad->set_rows(real_rows);
+  // Build a map of id -> row_index to speed up finding the index of one id
+  w_grad->set_height(w.dims()[0]);
+  auto* w_grad_value = w_grad->mutable_value();
+  phi::DDim temp_dim(w.dims());
+  temp_dim[0] = real_rows.size();
+  w_grad_value->Resize(temp_dim);
+  phi::HierarchicalSigmoidGradKernelImpl<T>(ctx,
+                                            x,
+                                            w,
+                                            label,
+                                            pre_out,
+                                            out_grad,
+                                            path,
+                                            code,
+                                            bias,
+                                            num_classes,
+                                            remote_prefetch,
+                                            trainer_id,
+                                            height_sections,
+                                            epmap,
+                                            table_names,
+                                            is_sparse,
+                                            x_grad,
+                                            w_grad_value,
+                                            bias_grad,
+                                            w_grad);
+}
+
+}  // namespace sr
+}  // namespace phi
+
+PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sr::HierarchicalSigmoidGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h
new file mode 100644
index 0000000000000..557c8b1bc5eed
--- /dev/null
+++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+namespace sr {
+
+template <typename T, typename Context>
+void HierarchicalSigmoidGradKernel(const Context& ctx,
+                                   const DenseTensor& x,
+                                   const DenseTensor& w,
+                                   const DenseTensor& label,
+                                   const DenseTensor& pre_out,
+                                   const DenseTensor& out_grad,
+                                   paddle::optional<const DenseTensor&> path,
+                                   paddle::optional<const DenseTensor&> code,
+                                   paddle::optional<const DenseTensor&> bias,
+                                   int num_classes,
+                                   bool remote_prefetch,
+                                   int trainer_id,
+                                   const std::vector<int64_t>& height_sections,
+                                   const std::vector<std::string>& epmap,
+                                   const std::vector<std::string>& table_names,
+                                   bool is_sparse,
+                                   DenseTensor* x_grad,
+                                   SelectedRows* w_grad,
+                                   DenseTensor* bias_grad);
+
+}  // namespace sr
+}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc
index 9bcd5d8544e2d..67126d82042b2 100644
--- a/paddle/phi/kernels/selected_rows/shape_kernel.cc
+++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/shape_kernel.h"
 
 namespace phi {
 namespace sr {
@@ -25,15 +26,7 @@ template <typename T, typename Context>
 void ShapeKernel(const Context& ctx,
                  const SelectedRows& input,
                  DenseTensor* out) {
-  auto in_var = input;
-  phi::DDim in_dims;
-  in_dims = in_var.value().dims();
-  auto out_t = out;
-  out_t->Resize({in_dims.size()});
-  auto out_data = ctx.template HostAlloc<int32_t>(out_t);
-  for (int i = 0; i < in_dims.size(); ++i) {
-    out_data[i] = in_dims[i];
-  }
+  phi::ShapeKernel<T, Context>(ctx, input.value(), out);
 }
 
 }  // namespace sr
diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/shape_kernel.cc
similarity index 53%
rename from paddle/phi/kernels/gpu/shape_kernel.cu
rename to paddle/phi/kernels/shape_kernel.cc
index 39b6eaeaef2a8..dd26a7edc9cdd 100644
--- a/paddle/phi/kernels/gpu/shape_kernel.cu
+++ b/paddle/phi/kernels/shape_kernel.cc
@@ -13,12 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/phi/kernels/shape_kernel.h"
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/float16.h"
+#include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/impl/shape_kernel_impl.h"
 
+namespace phi {
+
+template <typename T, typename Context>
+void ShapeKernel(const Context& ctx,
+                 const DenseTensor& input,
+                 DenseTensor* out) {
+  auto in_var = &input;
+  phi::DDim in_dims;
+  in_dims = in_var->dims();
+  auto out_t = out;
+  out_t->Resize({in_dims.size()});
+  auto out_data = ctx.template HostAlloc<int32_t>(out_t);
+  for (int i = 0; i < in_dims.size(); ++i) {
+    out_data[i] = in_dims[i];
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(shape,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::ShapeKernel,
+                   bool,
+                   int,
+                   int8_t,
+                   uint8_t,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_KERNEL(shape,
                    GPU,
                    ALL_LAYOUT,
@@ -33,3 +64,4 @@ PD_REGISTER_KERNEL(shape,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h
index 42bde442e1e06..23e059c72e776 100644
--- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h
+++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h
@@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                       const SparseCooTensor& x,
                       const DenseTensor& rulebook,
                       const DenseTensor& kernel,
-                      const SparseCooTensor& out_grad,
+                      const DenseTensor& out_grad,
                       const std::vector<int>& paddings,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
@@ -41,7 +41,7 @@ std::vector<DenseTensor> Conv3dGrad(const Context& dev_ctx,
                                     const SparseCooTensor& x,
                                     const DenseTensor& rulebook,
                                     const DenseTensor& kernel,
-                                    const SparseCooTensor& out_grad,
+                                    const DenseTensor& out_grad,
                                     const std::vector<int>& paddings,
                                     const std::vector<int>& dilations,
                                     const std::vector<int>& strides,
diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h
index 64c32df18971c..93a335e2f1c35 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution.h
+++ b/paddle/phi/kernels/sparse/cpu/convolution.h
@@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
 template <typename T, typename Context>
 void ProductRuleBook(const Context& dev_ctx,
                      const SparseCooTensor& x,
-                     const DenseTensor& kernel,
+                     const std::vector<int>& kernel_sizes,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
@@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx,
                      const bool subm,
                      DenseTensor* rulebook,
                      DenseTensor* counter_per_kernel) {
-  const auto& kernel_dims = kernel.dims();
   const int64_t non_zero_num = x.nnz();
   const auto& non_zero_indices = x.non_zero_indices();
   const int* indices_ptr = non_zero_indices.data<int>();
   int* counter_ptr = counter_per_kernel->data<int>();
-  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
   memset(counter_ptr, 0, kernel_size * sizeof(int));
 
   int rulebook_len = 0;
   // calc the rulebook_len
   const auto& x_dims = x.dims();
   const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
-  const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
+  const Dims4D c_kernel_dims(
+      1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
   const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
   const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]);
   const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
@@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx,
 
   auto f_calc_rulebook = [&](int* rulebook_ptr) {
     int kernel_index = 0, rulebook_index = 0;
-    for (int kz = 0; kz < kernel_dims[0]; kz++) {
-      for (int ky = 0; ky < kernel_dims[1]; ky++) {
-        for (int kx = 0; kx < kernel_dims[2]; kx++) {
+    for (int kz = 0; kz < kernel_sizes[0]; kz++) {
+      for (int ky = 0; ky < kernel_sizes[1]; ky++) {
+        for (int kx = 0; kx < kernel_sizes[2]; kx++) {
           ++kernel_index;
           for (int64_t i = 0; i < non_zero_num; i++) {
             int batch = indices_ptr[i];
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
index 5d7b381b7cb0b..3348d81cf6b4b 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
@@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                       const SparseCooTensor& x,
                       const DenseTensor& rulebook,
                       const DenseTensor& kernel,
-                      const SparseCooTensor& out_grad,
+                      const DenseTensor& out_grad,
                       const std::vector<int>& paddings,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
@@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
             rulebook_len,
             in_channels,
             in_features_ptr);
-  Gather<T>(out_grad.non_zero_elements().data<T>(),
+  Gather<T>(out_grad.data<T>(),
             rulebook_ptr + rulebook_len * 2,
             rulebook_len,
             out_channels,
diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
index 746ca04a826c0..f022e4ef4bb63 100644
--- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
+++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
@@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx,
   const auto& kernel_dims = kernel.dims();
   int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
   DDim out_dims = {1, 1, 1, 1, 1};
+  std::vector<int> kernel_sizes(kernel_dims.size());
+  for (int i = 0; i < kernel_dims.size(); i++) {
+    kernel_sizes[i] = kernel_dims[i];
+  }
+
   phi::funcs::sparse::GetOutShape(
-      x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
+      x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
   const int in_channels = kernel_dims[3];
   const int out_channels = kernel_dims[4];
 
@@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx,
 
   ProductRuleBook<T, Context>(dev_ctx,
                               x,
-                              kernel,
+                              kernel_sizes,
                               subm_paddings,
                               dilations,
                               subm_strides,
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
new file mode 100644
index 0000000000000..3010d480b55c9
--- /dev/null
+++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/sparse/convolution.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>
+void MaxPoolGradKernel(const Context& dev_ctx,
+                       const SparseCooTensor& x,
+                       const DenseTensor& rulebook,
+                       const SparseCooTensor& out,
+                       const DenseTensor& out_grad,
+                       const std::vector<int>& kernel_sizes,
+                       DenseTensor* x_grad) {
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
+  const int channels = x.dims()[4];
+  int rulebook_len = rulebook.dims()[1];
+  const int* rulebook_ptr = rulebook.data<int>();
+  std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
+  for (int i = 0; i < rulebook_len; i++) {
+    counter[rulebook_ptr[i]] += 1;
+  }
+  phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
+
+  const T* in_features_ptr = x.non_zero_elements().data<T>();
+  const T* out_features_ptr = out.non_zero_elements().data<T>();
+  const T* out_grad_ptr = out_grad.data<T>();
+  T* x_grad_ptr = x_grad->data<T>();
+  memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel());
+
+  phi::funcs::MaxPoolGrad<T> grad_functor;
+  for (int i = 0; i < kernel_size; i++) {
+    for (int j = 0; j < counter[i]; j++) {
+      int in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
+      int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
+      for (int c = 0; c < channels; c++) {
+        grad_functor.compute(in_features_ptr[in_i * channels + c],
+                             out_features_ptr[out_i * channels + c],
+                             out_grad_ptr[out_i * channels + c],
+                             1,
+                             &x_grad_ptr[in_i * channels + c]);
+      }
+    }
+  }
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_maxpool_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaxPoolGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
new file mode 100644
index 0000000000000..86971242df5ae
--- /dev/null
+++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/sparse/convolution.h"
+#include "paddle/phi/kernels/sparse/cpu/convolution.h"
+
+namespace phi {
+namespace sparse {
+
+/**
+ * x: (N, D, H, W, C)
+ * kernel: (D, H, W, C, OC)
+ * out: (N, D, H, W, OC)
+**/
+template <typename T, typename Context>
+void MaxPoolKernel(const Context& dev_ctx,
+                   const SparseCooTensor& x,
+                   const std::vector<int>& kernel_sizes,
+                   const std::vector<int>& paddings,
+                   const std::vector<int>& dilations,
+                   const std::vector<int>& strides,
+                   SparseCooTensor* out,
+                   DenseTensor* rulebook) {
+  const auto& x_dims = x.dims();
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
+  const std::vector<int>& real_kernel_sizes =
+      phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]);
+  DDim out_dims = {1, 1, 1, 1, 1};
+  phi::funcs::sparse::GetOutShape(
+      x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
+  const int in_channels = real_kernel_sizes[3];
+
+  DenseTensorMeta counter_meta(
+      DataType::INT32, {kernel_size}, DataLayout::NCHW);
+  DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+
+  const T* in_features_ptr = x.non_zero_elements().data<T>();
+  // 1. product rule book
+  ProductRuleBook<T, Context>(dev_ctx,
+                              x,
+                              real_kernel_sizes,
+                              paddings,
+                              dilations,
+                              strides,
+                              out_dims,
+                              false,
+                              rulebook,
+                              &counter_per_kernel);
+
+  UpdateRulebookAndOutIndex<T>(
+      dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out);
+
+  int rulebook_len = rulebook->dims()[1];
+  const int* rulebook_ptr = rulebook->data<int>();
+  const int* counter_ptr = counter_per_kernel.data<int>();
+
+  std::vector<int> offsets(kernel_size + 1);
+  phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size);
+  std::vector<bool> out_flags(out->nnz(), false);
+
+  // 2. max pool
+  T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
+  phi::funcs::MaxPool<T> max_pool_functor;
+  for (int i = 0; i < kernel_size; i++) {
+    for (int j = 0; j < counter_ptr[i]; j++) {
+      int in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
+      int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
+      if (!out_flags[out_i]) {
+        out_flags[out_i] = true;
+        memcpy(&out_features_ptr[out_i * in_channels],
+               &in_features_ptr[in_i * in_channels],
+               in_channels * sizeof(T));
+      } else {
+        for (int c = 0; c < in_channels; c++) {
+          max_pool_functor.compute(in_features_ptr[in_i * in_channels + c],
+                                   &out_features_ptr[out_i * in_channels + c]);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_maxpool,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaxPoolKernel,
+                   float,
+                   double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
index 8826fd7cf87e0..5b928817f64d7 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h
+++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
@@ -23,11 +23,15 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/kernels/funcs/index_impl.cu.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/compute_primitives.h"
 #include "paddle/phi/kernels/sparse/convolution_kernel.h"
 
 namespace phi {
 namespace sparse {
 
+using Dims4D = phi::funcs::sparse::Dims4D;
+
 // TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace
 // this kernel with phi::GatherCUDAKernel;
 // Vectorization can be used to improve read and write bandwidth
@@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx,
   return new_end.first;
 }
 
+template <typename T>
+__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
+                                              const int n,
+                                              const int rulebook_len,
+                                              const int kernel_size,
+                                              T* rulebook_ptr,
+                                              int* counter_ptr) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int cache_count[];  // kernel_size
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    cache_count[i] = 0;
+  }
+  __syncthreads();
+
+  for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
+    int index = indexs[i];
+    int kernel_index = rulebook_ptr[index];
+    rulebook_ptr[index + rulebook_len] = -1;
+    rulebook_ptr[index + 2 * rulebook_len] = -1;
+    rulebook_ptr[index] = -1;
+    atomicAdd(&cache_count[kernel_index], 1);
+  }
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    atomicSub(&counter_ptr[i], cache_count[i]);
+  }
+}
+
+/**
+ * @brief: update the out index and indices
+ * unique_keys: save the index of the output feature list
+ * unique_values: indiates the index of key before deduplication
+ * out_indexs: indicates the position of the output index in the rulebook
+ * rulebook_len: indicates the length of rulebook
+ * out_dims: indicates the output dims
+ * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
+ * rulebook_out_indexs: the output index in rulebook
+**/
+template <typename T>
+__global__ void UpdateIndexKernel(const int* unique_keys,
+                                  const int* unique_values,
+                                  const int* out_indexs,
+                                  const int non_zero_num,
+                                  const int rulebook_len,
+                                  const Dims4D out_dims,
+                                  T* out_indices,
+                                  T* rulebook_out_indexs) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
+    const int index = unique_keys[i];
+    int batch, x, y, z;
+    phi::funcs::sparse::IndexToPoint<Dims4D>(
+        index, out_dims, &batch, &x, &y, &z);
+    // get out indices
+    out_indices[i] = batch;
+    out_indices[i + non_zero_num] = z;
+    out_indices[i + non_zero_num * 2] = y;
+    out_indices[i + non_zero_num * 3] = x;
+
+    // update rulebook
+    int start = unique_values[i];
+    int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
+    // max(end-start) = kernel_size
+    for (int j = start; j < end; j++) {
+      rulebook_out_indexs[out_indexs[j]] = i;
+    }
+  }
+}
+
+// brief: calculation the distance between start and end
+template <typename T>
+__global__ void DistanceKernel(const T* start, const T* end, int* distance) {
+  if (threadIdx.x == 0) {
+    *distance = end - start;
+  }
+}
+
+/**
+ * @brief product rulebook
+ * for input_i in x_indices:
+ *   if input_i participate in the convolution calculation:
+ *       infer the output_i by input_i and kernel_i
+ *       save output_i
+ *
+ * x_indices: the indices of input features
+ * x_dims: the input dims
+ * kernel_dims: the kernel dims
+ * out_dims: the output dims
+ * non_zero_num: the number of input features
+ * rulebook: the rulebook to save the kernel index, input index and output index
+ * counter: save the number of times each location in the kernel participates in
+ *the caculation
+**/
+template <typename T>
+__global__ void ProductRuleBookKernel(const T* x_indices,
+                                      const Dims4D x_dims,
+                                      const Dims4D kernel_dims,
+                                      const Dims4D out_dims,
+                                      const int64_t non_zero_num,
+                                      const Dims4D paddings,
+                                      const Dims4D dilations,
+                                      const Dims4D strides,
+                                      const bool subm,
+                                      T* rulebook,
+                                      int* counter,
+                                      int* in_indexs) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  extern __shared__ int counter_buf[];  // kernel_size
+  const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
+  const int offset = kernel_size * non_zero_num;
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    counter_buf[i] = 0;
+  }
+  __syncthreads();
+
+  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
+    int kernel_index = 0;
+    int batch = x_indices[i];
+    int in_z = x_indices[i + non_zero_num];
+    int in_y = x_indices[i + 2 * non_zero_num];
+    int in_x = x_indices[i + 3 * non_zero_num];
+    if (subm) {
+      in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
+    }
+    for (int kz = 0; kz < kernel_dims[1]; kz++) {
+      for (int ky = 0; ky < kernel_dims[2]; ky++) {
+        for (int kx = 0; kx < kernel_dims[3]; kx++) {
+          int in_i = -1, out_index = -1, kernel_i = -1;
+          if (phi::funcs::sparse::Check(x_dims,
+                                        kernel_dims,
+                                        paddings,
+                                        dilations,
+                                        strides,
+                                        in_x,
+                                        in_y,
+                                        in_z,
+                                        kx,
+                                        ky,
+                                        kz)) {
+            int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
+            int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
+            int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
+            in_i = i;
+            out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
+                batch, out_x, out_y, out_z, out_dims);
+            atomicAdd(&counter_buf[kernel_index], 1);
+            kernel_i = kernel_index;
+          }
+          rulebook[kernel_index * non_zero_num + i] = kernel_i;
+          rulebook[kernel_index * non_zero_num + offset + i] = in_i;
+          rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
+          ++kernel_index;
+        }
+      }
+    }
+  }
+  __syncthreads();
+  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
+    atomicAdd(&counter[i], counter_buf[i]);
+  }
+}
+
+// the basic algorithm can refer to convolution_kernel.cc or
+// the second paper
+// example:
+// 1. the rulebook:
+//  the kernel_index:                       0, 0, 0, 1, 1, 1, 2, 2, ....
+//  the out_index(key):                     20, 30, 33, 30, 33, 20, 25
+// 2. mark the index of out_index(value):   0, 1, 2, 3, 4, 5, 6, ....
+// 3. sorted the (key, value)
+// 4. unique the (key, value):
+//  unique_key:     20, 25, 30, 33
+//  unique_values:  0, 2, 3, 5
+//  the index of unique_values is: 0, 1, 2, 3
+// 5. update the out_index by unique_key, uniqe_value and the index of
+// unique_value:
+//  the new out_index: 0, 2, 3, 2, 3, 0, 1
+template <typename T, typename Context>
+int ProductRuleBook(const Context& dev_ctx,
+                    const SparseCooTensor& x,
+                    const std::vector<int>& kernel_sizes,
+                    const std::vector<int>& paddings,
+                    const std::vector<int>& dilations,
+                    const std::vector<int>& strides,
+                    const DDim& out_dims,
+                    const bool subm,
+                    DenseTensor* rulebook,
+                    DenseTensor* counter_per_kernel,
+                    DenseTensor* offsets_per_kernel,
+                    DenseTensor* out_index,
+                    DenseTensor* unique_key,
+                    DenseTensor* unique_value,
+                    SparseCooTensor* out,
+                    std::vector<int>* h_counter,
+                    std::vector<int>* h_offsets) {
+  const int64_t non_zero_num = x.nnz();
+  const auto& non_zero_indices = x.non_zero_indices();
+  const int* indices_ptr = non_zero_indices.data<int>();
+  DenseTensor in_indexs = phi::Empty<Context>(
+      dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
+  int* counter_ptr = counter_per_kernel->data<int>();
+  int* offsets_ptr = offsets_per_kernel->data<int>();
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
+  const int rulebook_rows = 3;
+  const int rulebook_cols = kernel_size * non_zero_num;
+  rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
+  int* rulebook_ptr = rulebook->data<int>();
+
+  const auto x_dims = x.dims();
+  Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
+  Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]);
+  Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
+  Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
+  Dims4D d_strides(1, strides[2], strides[1], strides[0]);
+  Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
+
+  // 1. product rule book
+  phi::funcs::SetConstant<Context, int> set_zero;
+  set_zero(dev_ctx, counter_per_kernel, 0);
+  auto config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
+
+  ProductRuleBookKernel<int><<<config.block_per_grid.x,
+                               config.thread_per_block.x,
+                               kernel_size * sizeof(int),
+                               dev_ctx.stream()>>>(indices_ptr,
+                                                   d_x_dims,
+                                                   d_kernel_dims,
+                                                   d_out_dims,
+                                                   non_zero_num,
+                                                   d_paddings,
+                                                   d_dilations,
+                                                   d_strides,
+                                                   subm,
+                                                   rulebook_ptr,
+                                                   counter_ptr,
+                                                   in_indexs.data<int>());
+
+// 2. remove -1
+#ifdef PADDLE_WITH_HIP
+  int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                             rulebook_ptr,
+                             rulebook_ptr + rulebook_rows * rulebook_cols,
+                             -1);
+
+  DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
+      rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
+  int rulebook_len = 0;
+  phi::backends::gpu::GpuMemcpyAsync(
+      &rulebook_len,
+      rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
+      sizeof(int),
+#ifdef PADDLE_WITH_HIP
+      hipMemcpyDeviceToHost,
+#else
+      cudaMemcpyDeviceToHost,
+#endif
+      dev_ctx.stream());
+  rulebook_len /= 3;
+  dev_ctx.Wait();
+
+  if (subm) {
+    // At present, hashtable is not used to map the input and output indexes.
+    // At present, the intermediate output index is generated by normal
+    // convolution,
+    // and then the intermediate output index is subtracted from the input index
+    // to obain the rulebook.
+    // get difference
+    int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
+    int32_t* B_key_ptr = in_indexs.data<int>();
+    DenseTensor A_val = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
+    DenseTensor B_val = phi::Empty<Context>(
+        dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
+    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+        dev_ctx, &A_val, kps::IdentityFunctor<int>());
+    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
+        dev_ctx, &B_val, kps::IdentityFunctor<int>());
+    DenseTensor key_result = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
+    DenseTensor val_result = phi::Empty<Context>(
+        dev_ctx,
+        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
+
+#ifdef PADDLE_WITH_HIP
+    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
+#else
+    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                           counter_ptr,
+                           counter_ptr + kernel_size,
+                           offsets_ptr);
+    std::vector<int> offsets(kernel_size, 0);
+    // TODO(zhangkaihuo): used unified memcpy interface
+    phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
+                                       offsets_ptr,
+                                       kernel_size * sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+
+    thrust::pair<int*, int*> end;
+    // Because set_diff does not support duplicate data, set_diff is performed
+    // separately for each segment of data.
+    // TODO(zhangkaihuo): Using hashtable here may get better performance,
+    // further tests ared needed.
+    for (int i = 0; i < kernel_size; i++) {
+      int start = offsets[i];
+      int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
+      int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
+      int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
+      end =
+#ifdef PADDLE_WITH_HIP
+          thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
+#else
+          thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                                        A_key_ptr + start,
+                                        A_key_ptr + stop,
+                                        B_key_ptr,
+                                        B_key_ptr + x.nnz(),
+                                        A_val.data<int>() + start,
+                                        B_val.data<int>(),
+                                        key_result_start,
+                                        val_result_start);
+    }
+
+    DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
+        key_result.data<int>(),
+        end.first,
+        key_result.data<int>() + rulebook_len);
+    int len = 0;
+    phi::backends::gpu::GpuMemcpyAsync(&len,
+                                       key_result.data<int>() + rulebook_len,
+                                       sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+    // set the diff value = -1, and update counter
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
+    SetFlagAndUpdateCounterKernel<int><<<config.block_per_grid.x,
+                                         config.thread_per_block,
+                                         kernel_size * sizeof(int),
+                                         dev_ctx.stream()>>>(
+        val_result.data<int>(),
+        len,
+        rulebook_len,
+        kernel_size,
+        rulebook_ptr,
+        counter_ptr);
+// remove -1
+#ifdef PADDLE_WITH_HIP
+    int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
+#else
+    int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                               rulebook_ptr,
+                               rulebook_ptr + 3 * rulebook_len,
+                               -1);
+    DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>(
+        rulebook_ptr, last, key_result.data<int>() + rulebook_len);
+    phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
+                                       key_result.data<int>() + rulebook_len,
+                                       sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                       hipMemcpyDeviceToHost,
+#else
+                                       cudaMemcpyDeviceToHost,
+#endif
+                                       dev_ctx.stream());
+    dev_ctx.Wait();
+    rulebook_len /= 3;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+                         counter_ptr,
+                         counter_ptr + kernel_size,
+                         offsets_ptr);
+
+#ifdef PADDLE_WITH_HIP
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
+                                     counter_ptr,
+                                     kernel_size * sizeof(int),
+                                     hipMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
+                                     offsets_ptr,
+                                     kernel_size * sizeof(int),
+                                     hipMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+#else
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
+                                     counter_ptr,
+                                     kernel_size * sizeof(int),
+                                     cudaMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
+                                     offsets_ptr,
+                                     kernel_size * sizeof(int),
+                                     cudaMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+#endif
+  rulebook->Resize({rulebook_rows, rulebook_len});
+
+  // 3. sorted or merge the out index
+  out_index->ResizeAndAllocate({rulebook_len});
+  unique_value->ResizeAndAllocate({rulebook_len});
+  unique_key->ResizeAndAllocate({rulebook_len});
+  int* out_index_ptr = out_index->data<int>();
+  int* unique_value_ptr = unique_value->data<int>();
+  int* unique_key_ptr = unique_key->data<int>();
+
+  int* new_end = SortedAndUniqueIndex(dev_ctx,
+                                      rulebook_ptr + 2 * rulebook_len,
+                                      rulebook_len,
+                                      out_index,
+                                      unique_key,
+                                      unique_value);
+  // thrust::distance doesn't support stream parameters
+  // const int out_non_zero_num = thrust::distance(unique_key_ptr,
+  // new_end.first);
+  DistanceKernel<int><<<1, 1>>>(
+      unique_key_ptr,
+      new_end,
+      rulebook_ptr + rulebook_rows * rulebook_cols - 1);
+  int out_non_zero_num = 0;
+#ifdef PADDLE_WITH_HIP
+  phi::backends::gpu::GpuMemcpyAsync(
+      &out_non_zero_num,
+      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
+      sizeof(int),
+      hipMemcpyDeviceToHost,
+      dev_ctx.stream());
+#else
+  phi::backends::gpu::GpuMemcpyAsync(
+      &out_non_zero_num,
+      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
+      sizeof(int),
+      cudaMemcpyDeviceToHost,
+      dev_ctx.stream());
+#endif
+  dev_ctx.Wait();
+
+  // 5. update out_indices and rulebook by unique_value_ptr
+  const int64_t sparse_dim = 4;
+  DenseTensorMeta indices_meta(
+      DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
+  DenseTensorMeta values_meta(
+      x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout());
+  phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
+  phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
+
+  int* out_indices_ptr = out_indices.data<int>();
+
+  config =
+      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
+  UpdateIndexKernel<int><<<config.block_per_grid.x,
+                           config.thread_per_block.x,
+                           0,
+                           dev_ctx.stream()>>>(unique_key_ptr,
+                                               unique_value_ptr,
+                                               out_index_ptr,
+                                               out_non_zero_num,
+                                               rulebook_len,
+                                               d_out_dims,
+                                               out_indices_ptr,
+                                               rulebook_ptr + 2 * rulebook_len);
+  out->SetMember(out_indices, out_values, out_dims, true);
+  return rulebook_len;
+}
+
 }  // namespace sparse
 }  // namespace phi
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
index d6d992d0f4b65..4db0a0b0011b5 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
@@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
                       const SparseCooTensor& x,
                       const DenseTensor& rulebook,
                       const DenseTensor& kernel,
-                      const SparseCooTensor& out_grad,
+                      const DenseTensor& out_grad,
                       const std::vector<int>& paddings,
                       const std::vector<int>& dilations,
                       const std::vector<int>& strides,
@@ -140,12 +140,11 @@ void Conv3dGradKernel(const Context& dev_ctx,
   GatherKernel<T, int><<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
-                         dev_ctx.stream()>>>(
-      out_grad.non_zero_elements().data<T>(),
-      rulebook_ptr + rulebook_len * 2,
-      out_grad_features_ptr,
-      rulebook_len,
-      out_channels);
+                         dev_ctx.stream()>>>(out_grad.data<T>(),
+                                             rulebook_ptr + rulebook_len * 2,
+                                             out_grad_features_ptr,
+                                             rulebook_len,
+                                             out_channels);
 
   const T* kernel_ptr = kernel.data<T>();
   for (int i = 0; i < kernel_size; i++) {
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
index 1a0c7e9b97214..214e689e9370a 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
@@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thrust/execution_policy.h>
-#include <thrust/remove.h>
-#include <thrust/sort.h>
-#include <thrust/unique.h>
-
-#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_info.h"
-#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/index_impl.cu.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/primitive/compute_primitives.h"
 #include "paddle/phi/kernels/sparse/convolution_kernel.h"
 #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
 
 namespace phi {
 namespace sparse {
 
-using Dims4D = phi::funcs::sparse::Dims4D;
-
-__global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
-                                              const int n,
-                                              const int rulebook_len,
-                                              const int kernel_size,
-                                              int* rulebook_ptr,
-                                              int* counter_ptr) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  extern __shared__ int cache_count[];  // kernel_size
-  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
-    cache_count[i] = 0;
-  }
-  __syncthreads();
-
-  for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
-    int index = indexs[i];
-    int kernel_index = rulebook_ptr[index];
-    rulebook_ptr[index + rulebook_len] = -1;
-    rulebook_ptr[index + 2 * rulebook_len] = -1;
-    rulebook_ptr[index] = -1;
-    atomicAdd(&cache_count[kernel_index], 1);
-  }
-  __syncthreads();
-
-  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
-    atomicSub(&counter_ptr[i], cache_count[i]);
-  }
-}
-
-/**
- * @brief: update the out index and indices
- * unique_keys: save the index of the output feature list
- * unique_values: indiates the index of key before deduplication
- * out_indexs: indicates the position of the output index in the rulebook
- * rulebook_len: indicates the length of rulebook
- * out_dims: indicates the output dims
- * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys)
- * rulebook_out_indexs: the output index in rulebook
-**/
-__global__ void UpdateIndexKernel(const int* unique_keys,
-                                  const int* unique_values,
-                                  const int* out_indexs,
-                                  const int non_zero_num,
-                                  const int rulebook_len,
-                                  const Dims4D out_dims,
-                                  int* out_indices,
-                                  int* rulebook_out_indexs) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
-    const int index = unique_keys[i];
-    int batch, x, y, z;
-    phi::funcs::sparse::IndexToPoint<Dims4D>(
-        index, out_dims, &batch, &x, &y, &z);
-    // get out indices
-    out_indices[i] = batch;
-    out_indices[i + non_zero_num] = z;
-    out_indices[i + non_zero_num * 2] = y;
-    out_indices[i + non_zero_num * 3] = x;
-
-    // update rulebook
-    int start = unique_values[i];
-    int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
-    // max(end-start) = kernel_size
-    for (int j = start; j < end; j++) {
-      rulebook_out_indexs[out_indexs[j]] = i;
-    }
-  }
-}
-
-/**
- * @brief product rulebook
- * for input_i in x_indices:
- *   if input_i participate in the convolution calculation:
- *       infer the output_i by input_i and kernel_i
- *       save output_i
- *
- * x_indices: the indices of input features
- * x_dims: the input dims
- * kernel_dims: the kernel dims
- * out_dims: the output dims
- * non_zero_num: the number of input features
- * rulebook: the rulebook to save the kernel index, input index and output index
- * counter: save the number of times each location in the kernel participates in
- *the caculation
-**/
-__global__ void ProductRuleBookKernel(const int* x_indices,
-                                      const Dims4D x_dims,
-                                      const Dims4D kernel_dims,
-                                      const Dims4D out_dims,
-                                      const int64_t non_zero_num,
-                                      const Dims4D paddings,
-                                      const Dims4D dilations,
-                                      const Dims4D strides,
-                                      const bool subm,
-                                      int* rulebook,
-                                      int* counter,
-                                      int* in_indexs) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  extern __shared__ int counter_buf[];  // kernel_size
-  const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
-  const int offset = kernel_size * non_zero_num;
-  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
-    counter_buf[i] = 0;
-  }
-  __syncthreads();
-
-  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
-    int kernel_index = 0;
-    int batch = x_indices[i];
-    int in_z = x_indices[i + non_zero_num];
-    int in_y = x_indices[i + 2 * non_zero_num];
-    int in_x = x_indices[i + 3 * non_zero_num];
-    if (subm) {
-      in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
-    }
-    for (int kz = 0; kz < kernel_dims[1]; kz++) {
-      for (int ky = 0; ky < kernel_dims[2]; ky++) {
-        for (int kx = 0; kx < kernel_dims[3]; kx++) {
-          int in_i = -1, out_index = -1, kernel_i = -1;
-          if (phi::funcs::sparse::Check(x_dims,
-                                        kernel_dims,
-                                        paddings,
-                                        dilations,
-                                        strides,
-                                        in_x,
-                                        in_y,
-                                        in_z,
-                                        kx,
-                                        ky,
-                                        kz)) {
-            int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
-            int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
-            int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
-            in_i = i;
-            out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
-                batch, out_x, out_y, out_z, out_dims);
-            atomicAdd(&counter_buf[kernel_index], 1);
-            kernel_i = kernel_index;
-          }
-          rulebook[kernel_index * non_zero_num + i] = kernel_i;
-          rulebook[kernel_index * non_zero_num + offset + i] = in_i;
-          rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index;
-          ++kernel_index;
-        }
-      }
-    }
-  }
-  __syncthreads();
-  for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) {
-    atomicAdd(&counter[i], counter_buf[i]);
-  }
-}
-
-// brief: calculation the distance between start and end
-__global__ void DistanceKernel(const int* start,
-                               const int* end,
-                               int* distance) {
-  if (threadIdx.x == 0) {
-    *distance = end - start;
-  }
-}
-
-// the basic algorithm can refer to convolution_kernel.cc or
-// the second paper
-// example:
-// 1. the rulebook:
-//  the kernel_index:                       0, 0, 0, 1, 1, 1, 2, 2, ....
-//  the out_index(key):                     20, 30, 33, 30, 33, 20, 25
-// 2. mark the index of out_index(value):   0, 1, 2, 3, 4, 5, 6, ....
-// 3. sorted the (key, value)
-// 4. unique the (key, value):
-//  unique_key:     20, 25, 30, 33
-//  unique_values:  0, 2, 3, 5
-//  the index of unique_values is: 0, 1, 2, 3
-// 5. update the out_index by unique_key, uniqe_value and the index of
-// unique_value:
-//  the new out_index: 0, 2, 3, 2, 3, 0, 1
-template <typename T, typename Context>
-int ProductRuleBook(const Context& dev_ctx,
-                    const SparseCooTensor& x,
-                    const DenseTensor& kernel,
-                    const std::vector<int>& paddings,
-                    const std::vector<int>& dilations,
-                    const std::vector<int>& strides,
-                    const DDim& out_dims,
-                    const bool subm,
-                    DenseTensor* rulebook,
-                    DenseTensor* counter_per_kernel,
-                    DenseTensor* offsets_per_kernel,
-                    DenseTensor* out_index,
-                    DenseTensor* unique_key,
-                    DenseTensor* unique_value,
-                    SparseCooTensor* out,
-                    std::vector<int>* h_counter,
-                    std::vector<int>* h_offsets) {
-  const auto& kernel_dims = kernel.dims();
-  const int64_t non_zero_num = x.nnz();
-  const auto& non_zero_indices = x.non_zero_indices();
-  const int* indices_ptr = non_zero_indices.data<int>();
-  DenseTensor in_indexs = phi::Empty<Context>(
-      dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
-  int* counter_ptr = counter_per_kernel->data<int>();
-  int* offsets_ptr = offsets_per_kernel->data<int>();
-  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
-  const int rulebook_rows = 3;
-  const int rulebook_cols = kernel_size * non_zero_num;
-  rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols});
-  int* rulebook_ptr = rulebook->data<int>();
-
-  const auto x_dims = x.dims();
-  Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
-  Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]);
-  Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]);
-  Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]);
-  Dims4D d_strides(1, strides[2], strides[1], strides[0]);
-  Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]);
-
-  // 1. product rule book
-  phi::funcs::SetConstant<Context, int> set_zero;
-  set_zero(dev_ctx, counter_per_kernel, 0);
-  auto config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
-
-  ProductRuleBookKernel<<<config.block_per_grid.x,
-                          config.thread_per_block.x,
-                          kernel_size * sizeof(int),
-                          dev_ctx.stream()>>>(indices_ptr,
-                                              d_x_dims,
-                                              d_kernel_dims,
-                                              d_out_dims,
-                                              non_zero_num,
-                                              d_paddings,
-                                              d_dilations,
-                                              d_strides,
-                                              subm,
-                                              rulebook_ptr,
-                                              counter_ptr,
-                                              in_indexs.data<int>());
-
-// 2. remove -1
-#ifdef PADDLE_WITH_HIP
-  int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
-#else
-  int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-                             rulebook_ptr,
-                             rulebook_ptr + rulebook_rows * rulebook_cols,
-                             -1);
-
-  DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
-      rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
-  int rulebook_len = 0;
-  phi::backends::gpu::GpuMemcpyAsync(
-      &rulebook_len,
-      rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
-      sizeof(int),
-#ifdef PADDLE_WITH_HIP
-      hipMemcpyDeviceToHost,
-#else
-      cudaMemcpyDeviceToHost,
-#endif
-      dev_ctx.stream());
-  rulebook_len /= 3;
-  dev_ctx.Wait();
-
-  if (subm) {
-    // At present, hashtable is not used to map the input and output indexes.
-    // At present, the intermediate output index is generated by normal
-    // convolution,
-    // and then the intermediate output index is subtracted from the input index
-    // to obain the rulebook.
-    // get difference
-    int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
-    int32_t* B_key_ptr = in_indexs.data<int>();
-    DenseTensor A_val = phi::Empty<Context>(
-        dev_ctx,
-        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
-    DenseTensor B_val = phi::Empty<Context>(
-        dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
-    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
-        dev_ctx, &A_val, kps::IdentityFunctor<int>());
-    phi::IndexKernel<int, kps::IdentityFunctor<int>>(
-        dev_ctx, &B_val, kps::IdentityFunctor<int>());
-    DenseTensor key_result = phi::Empty<Context>(
-        dev_ctx,
-        DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW));
-    DenseTensor val_result = phi::Empty<Context>(
-        dev_ctx,
-        DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
-
-#ifdef PADDLE_WITH_HIP
-    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
-#else
-    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-                           counter_ptr,
-                           counter_ptr + kernel_size,
-                           offsets_ptr);
-    std::vector<int> offsets(kernel_size, 0);
-    // TODO(zhangkaihuo): used unified memcpy interface
-    phi::backends::gpu::GpuMemcpyAsync(offsets.data(),
-                                       offsets_ptr,
-                                       kernel_size * sizeof(int),
-#ifdef PADDLE_WITH_HIP
-                                       hipMemcpyDeviceToHost,
-#else
-                                       cudaMemcpyDeviceToHost,
-#endif
-                                       dev_ctx.stream());
-    dev_ctx.Wait();
-
-    thrust::pair<int*, int*> end;
-    // Because set_diff does not support duplicate data, set_diff is performed
-    // separately for each segment of data.
-    // TODO(zhangkaihuo): Using hashtable here may get better performance,
-    // further tests ared needed.
-    for (int i = 0; i < kernel_size; i++) {
-      int start = offsets[i];
-      int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
-      int* key_result_start = (i == 0 ? key_result.data<int>() : end.first);
-      int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
-      end =
-#ifdef PADDLE_WITH_HIP
-          thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()),
-#else
-          thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-                                        A_key_ptr + start,
-                                        A_key_ptr + stop,
-                                        B_key_ptr,
-                                        B_key_ptr + x.nnz(),
-                                        A_val.data<int>() + start,
-                                        B_val.data<int>(),
-                                        key_result_start,
-                                        val_result_start);
-    }
-
-    DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
-        key_result.data<int>(),
-        end.first,
-        key_result.data<int>() + rulebook_len);
-    int len = 0;
-    phi::backends::gpu::GpuMemcpyAsync(&len,
-                                       key_result.data<int>() + rulebook_len,
-                                       sizeof(int),
-#ifdef PADDLE_WITH_HIP
-                                       hipMemcpyDeviceToHost,
-#else
-                                       cudaMemcpyDeviceToHost,
-#endif
-                                       dev_ctx.stream());
-    dev_ctx.Wait();
-    // set the diff value = -1, and update counter
-    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
-    SetFlagAndUpdateCounterKernel<<<config.block_per_grid.x,
-                                    config.thread_per_block,
-                                    kernel_size * sizeof(int),
-                                    dev_ctx.stream()>>>(val_result.data<int>(),
-                                                        len,
-                                                        rulebook_len,
-                                                        kernel_size,
-                                                        rulebook_ptr,
-                                                        counter_ptr);
-// remove -1
-#ifdef PADDLE_WITH_HIP
-    int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
-#else
-    int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-                               rulebook_ptr,
-                               rulebook_ptr + 3 * rulebook_len,
-                               -1);
-    DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>(
-        rulebook_ptr, last, key_result.data<int>() + rulebook_len);
-    phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
-                                       key_result.data<int>() + rulebook_len,
-                                       sizeof(int),
-#ifdef PADDLE_WITH_HIP
-                                       hipMemcpyDeviceToHost,
-#else
-                                       cudaMemcpyDeviceToHost,
-#endif
-                                       dev_ctx.stream());
-    dev_ctx.Wait();
-    rulebook_len /= 3;
-  }
-
-#ifdef PADDLE_WITH_HIP
-  thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
-#else
-  thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
-#endif
-                         counter_ptr,
-                         counter_ptr + kernel_size,
-                         offsets_ptr);
-
-#ifdef PADDLE_WITH_HIP
-  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
-                                     counter_ptr,
-                                     kernel_size * sizeof(int),
-                                     hipMemcpyDeviceToHost,
-                                     dev_ctx.stream());
-  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
-                                     offsets_ptr,
-                                     kernel_size * sizeof(int),
-                                     hipMemcpyDeviceToHost,
-                                     dev_ctx.stream());
-#else
-  phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0],
-                                     counter_ptr,
-                                     kernel_size * sizeof(int),
-                                     cudaMemcpyDeviceToHost,
-                                     dev_ctx.stream());
-  phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0],
-                                     offsets_ptr,
-                                     kernel_size * sizeof(int),
-                                     cudaMemcpyDeviceToHost,
-                                     dev_ctx.stream());
-#endif
-  rulebook->Resize({rulebook_rows, rulebook_len});
-
-  // 3. sorted or merge the out index
-  out_index->ResizeAndAllocate({rulebook_len});
-  unique_value->ResizeAndAllocate({rulebook_len});
-  unique_key->ResizeAndAllocate({rulebook_len});
-  int* out_index_ptr = out_index->data<int>();
-  int* unique_value_ptr = unique_value->data<int>();
-  int* unique_key_ptr = unique_key->data<int>();
-
-  int* new_end = SortedAndUniqueIndex(dev_ctx,
-                                      rulebook_ptr + 2 * rulebook_len,
-                                      rulebook_len,
-                                      out_index,
-                                      unique_key,
-                                      unique_value);
-  // thrust::distance doesn't support stream parameters
-  // const int out_non_zero_num = thrust::distance(unique_key_ptr,
-  // new_end.first);
-  DistanceKernel<<<1, 1>>>(unique_key_ptr,
-                           new_end,
-                           rulebook_ptr + rulebook_rows * rulebook_cols - 1);
-  int out_non_zero_num = 0;
-#ifdef PADDLE_WITH_HIP
-  phi::backends::gpu::GpuMemcpyAsync(
-      &out_non_zero_num,
-      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
-      sizeof(int),
-      hipMemcpyDeviceToHost,
-      dev_ctx.stream());
-#else
-  phi::backends::gpu::GpuMemcpyAsync(
-      &out_non_zero_num,
-      rulebook_ptr + rulebook_rows * rulebook_cols - 1,
-      sizeof(int),
-      cudaMemcpyDeviceToHost,
-      dev_ctx.stream());
-#endif
-  dev_ctx.Wait();
-
-  // 5. update out_indices and rulebook by unique_value_ptr
-  const int64_t sparse_dim = 4;
-  DenseTensorMeta indices_meta(
-      DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
-  DenseTensorMeta values_meta(
-      x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout());
-  phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
-  phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
-
-  int* out_indices_ptr = out_indices.data<int>();
-
-  config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
-  UpdateIndexKernel<<<config.block_per_grid.x,
-                      config.thread_per_block.x,
-                      0,
-                      dev_ctx.stream()>>>(unique_key_ptr,
-                                          unique_value_ptr,
-                                          out_index_ptr,
-                                          out_non_zero_num,
-                                          rulebook_len,
-                                          d_out_dims,
-                                          out_indices_ptr,
-                                          rulebook_ptr + 2 * rulebook_len);
-  out->SetMember(out_indices, out_values, out_dims, true);
-  return rulebook_len;
-}
-
 /**
  * x: (N, D, H, W, C)
  * kernel: (D, H, W, C, OC)
@@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx,
   const auto& kernel_dims = kernel.dims();
   int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
   DDim out_dims = {1, 1, 1, 1, 1};
+  std::vector<int> kernel_sizes(kernel_dims.size());
+  for (int i = 0; i < kernel_dims.size(); i++) {
+    kernel_sizes[i] = kernel_dims[i];
+  }
   phi::funcs::sparse::GetOutShape(
-      x_dims, kernel_dims, paddings, dilations, strides, &out_dims);
-  out->set_dims(out_dims);
+      x_dims, kernel_sizes, paddings, dilations, strides, &out_dims);
   const int in_channels = kernel_dims[3];
   const int out_channels = kernel_dims[4];
   std::vector<int> offsets(kernel_size + 1), h_counter(kernel_size);
@@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx,
 
   int n = ProductRuleBook<T, Context>(dev_ctx,
                                       x,
-                                      kernel,
+                                      kernel_sizes,
                                       subm_paddings,
                                       dilations,
                                       subm_strides,
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
new file mode 100644
index 0000000000000..1048dd1be0c01
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
@@ -0,0 +1,120 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/sparse/convolution.h"
+
+#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T>
+__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
+                                      const T* out_features_ptr,
+                                      const T* out_grad_ptr,
+                                      const int* rulebook_ptr,
+                                      const int n,
+                                      const int rulebook_len,
+                                      const int channels,
+                                      T* x_grad_ptr) {
+  phi::funcs::MaxPoolGrad<T> grad_functor;
+  CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
+    int real_i = i / channels;
+    int c = i - real_i * channels;
+    int in_i = rulebook_ptr[real_i];
+    int out_i = rulebook_ptr[real_i + rulebook_len];
+    grad_functor.compute(in_features_ptr[in_i * channels + c],
+                         out_features_ptr[out_i * channels + c],
+                         out_grad_ptr[out_i * channels + c],
+                         1,
+                         &x_grad_ptr[in_i * channels + c]);
+  }
+}
+
+template <typename T, typename Context>
+void MaxPoolGradKernel(const Context& dev_ctx,
+                       const SparseCooTensor& x,
+                       const DenseTensor& rulebook,
+                       const SparseCooTensor& out,
+                       const DenseTensor& out_grad,
+                       const std::vector<int>& kernel_sizes,
+                       DenseTensor* x_grad) {
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
+  const int in_channels = x.dims()[4];
+  int rulebook_len = rulebook.dims()[1];
+  const int* rulebook_ptr = rulebook.data<int>();
+  std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0),
+      h_counter(kernel_size);
+  phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
+                                     rulebook_ptr,
+                                     rulebook_len * sizeof(int),
+#ifdef PADDLE_WITH_HIP
+                                     hipMemcpyDeviceToHost,
+#else
+                                     cudaMemcpyDeviceToHost,
+#endif
+
+                                     dev_ctx.stream());
+  dev_ctx.Wait();
+  for (int i = 0; i < rulebook_len; i++) {
+    counter[h_counter[i]] += 1;
+  }
+  phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size);
+
+  const T* in_features_ptr = x.non_zero_elements().data<T>();
+  const T* out_features_ptr = out.non_zero_elements().data<T>();
+  const T* out_grad_ptr = out_grad.data<T>();
+  T* x_grad_ptr = x_grad->data<T>();
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, x_grad, static_cast<T>(0.0f));
+
+  for (int i = 0; i < kernel_size; i++) {
+    if (counter[i] <= 0) {
+      continue;
+    }
+
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, counter[i] * in_channels, 1);
+    MaxPoolGradCudaKernel<T><<<config.block_per_grid.x,
+                               config.thread_per_block.x,
+                               0,
+                               dev_ctx.stream()>>>(
+        in_features_ptr,
+        out_features_ptr,
+        out_grad_ptr,
+        rulebook_ptr + offsets[i] + rulebook_len,
+        counter[i],
+        rulebook_len,
+        in_channels,
+        x_grad_ptr);
+  }
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_maxpool_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaxPoolGradKernel,
+                   float,
+                   double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
new file mode 100644
index 0000000000000..0f6a0d13b1ddb
--- /dev/null
+++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
@@ -0,0 +1,140 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/kernels/funcs/pooling.h"
+#include "paddle/phi/kernels/funcs/sparse/convolution.h"
+#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T>
+__global__ void MaxPoolCudaKernel(const T* in_features_ptr,
+                                  const int* rulebook_ptr,
+                                  const int n,
+                                  const int rulebook_len,
+                                  const int channels,
+                                  T* out_features_ptr) {
+  phi::funcs::MaxPool<T> max_pool_functor;
+  CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
+    int real_i = i / channels;
+    int channel_i = i - real_i * channels;
+    int in_i = rulebook_ptr[real_i];
+    int out_i = rulebook_ptr[real_i + rulebook_len];
+    max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i],
+                             &out_features_ptr[out_i * channels + channel_i]);
+  }
+}
+
+/**
+ * x: (N, D, H, W, C)
+ * kernel: (D, H, W, C, OC)
+ * out: (N, D, H, W, OC)
+**/
+template <typename T, typename Context>
+void MaxPoolKernel(const Context& dev_ctx,
+                   const SparseCooTensor& x,
+                   const std::vector<int>& kernel_sizes,
+                   const std::vector<int>& paddings,
+                   const std::vector<int>& dilations,
+                   const std::vector<int>& strides,
+                   SparseCooTensor* out,
+                   DenseTensor* rulebook) {
+  const auto& x_dims = x.dims();
+  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
+  const std::vector<int>& real_kernel_sizes =
+      phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]);
+  DDim out_dims = {1, 1, 1, 1, 1};
+  phi::funcs::sparse::GetOutShape(
+      x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
+  const int in_channels = real_kernel_sizes[3];
+
+  std::vector<int> offsets(kernel_size + 1), counter(kernel_size);
+  DenseTensorMeta counter_meta(
+      DataType::INT32, {kernel_size}, DataLayout::NCHW);
+  DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+  DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
+  DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
+  DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
+  DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
+
+  // 1. product rulebook
+  int rulebook_len = ProductRuleBook<T, Context>(dev_ctx,
+                                                 x,
+                                                 real_kernel_sizes,
+                                                 paddings,
+                                                 dilations,
+                                                 strides,
+                                                 out_dims,
+                                                 false,
+                                                 rulebook,
+                                                 &counter_per_kernel,
+                                                 &offsets_per_kernel,
+                                                 &out_index,
+                                                 &unique_key,
+                                                 &unique_value,
+                                                 out,
+                                                 &counter,
+                                                 &offsets);
+
+  const int* rulebook_ptr = rulebook->data<int>();
+
+  T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
+  const T* in_features_ptr = x.non_zero_elements().data<T>();
+// 2. max pool
+#ifdef PADDLE_WITH_HIP
+  thrust::fill(thrust::hip::par.on(dev_ctx.stream()),
+#else
+  thrust::fill(thrust::cuda::par.on(dev_ctx.stream()),
+#endif
+               out_features_ptr,
+               out_features_ptr + out->non_zero_elements().numel(),
+               static_cast<T>(-FLT_MAX));
+  // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster
+  for (int i = 0; i < kernel_size; i++) {
+    if (counter[i] <= 0) {
+      continue;
+    }
+
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, counter[i] * in_channels, 1);
+    MaxPoolCudaKernel<T><<<config.block_per_grid.x,
+                           config.thread_per_block.x,
+                           0,
+                           dev_ctx.stream()>>>(
+        in_features_ptr,
+        rulebook_ptr + offsets[i] + rulebook_len,
+        counter[i],
+        rulebook_len,
+        in_channels,
+        out_features_ptr);
+  }
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(sparse_maxpool,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::sparse::MaxPoolKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h
new file mode 100644
index 0000000000000..572ade76281bc
--- /dev/null
+++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>
+void MaxPoolGradKernel(const Context& dev_ctx,
+                       const SparseCooTensor& x,
+                       const DenseTensor& rulebook,
+                       const SparseCooTensor& out,
+                       const DenseTensor& out_grad,
+                       const std::vector<int>& kernel_sizes,
+                       DenseTensor* x_grad);
+
+template <typename T, typename Context>
+DenseTensor MaxPoolGrad(const Context& dev_ctx,
+                        const SparseCooTensor& x,
+                        const DenseTensor& rulebook,
+                        const SparseCooTensor& out,
+                        const DenseTensor& out_grad,
+                        const std::vector<int>& kernel_sizes) {
+  DenseTensor x_grad = phi::Empty<Context>(
+      dev_ctx,
+      DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout()));
+  MaxPoolGradKernel<T, Context>(
+      dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad);
+  return x_grad;
+}
+
+}  // namespace sparse
+}  // namespace phi
diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h
new file mode 100644
index 0000000000000..bfadbf72e300f
--- /dev/null
+++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/sparse_coo_tensor.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>
+void MaxPoolKernel(const Context& dev_ctx,
+                   const SparseCooTensor& x,
+                   const std::vector<int>& kernel_sizes,
+                   const std::vector<int>& paddings,
+                   const std::vector<int>& dilations,
+                   const std::vector<int>& strides,
+                   SparseCooTensor* out,
+                   DenseTensor* rulebook);
+
+template <typename T, typename Context>
+SparseCooTensor MaxPool(const Context& dev_ctx,
+                        const SparseCooTensor& x,
+                        const std::vector<int>& kernel_sizes,
+                        const std::vector<int>& paddings,
+                        const std::vector<int>& dilations,
+                        const std::vector<int>& strides,
+                        DenseTensor* rulebook) {
+  DenseTensor indices = phi::Empty<Context>(
+      dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+  DenseTensor values =
+      phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
+  SparseCooTensor coo(indices, values, x.dims());
+  MaxPoolKernel<T, Context>(
+      dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook);
+  return coo;
+}
+
+}  // namespace sparse
+}  // namespace phi
diff --git a/paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h b/paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h
new file mode 100644
index 0000000000000..772268c2cc388
--- /dev/null
+++ b/paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingGradKernel(const Context& ctx,
+                                     const DenseTensor& input,
+                                     const SelectedRows& weight,
+                                     const DenseTensor& out_grad,
+                                     int64_t padding_idx,
+                                     DenseTensor* weight_grad);
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingSparseGradKernel(const Context& ctx,
+                                           const DenseTensor& input,
+                                           const SelectedRows& weight,
+                                           const DenseTensor& out_grad,
+                                           int64_t padding_idx,
+                                           SelectedRows* weight_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/sparse_weight_embedding_kernel.h b/paddle/phi/kernels/sparse_weight_embedding_kernel.h
new file mode 100644
index 0000000000000..c7392b691aa0f
--- /dev/null
+++ b/paddle/phi/kernels/sparse_weight_embedding_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SparseWeightEmbeddingKernel(const Context& ctx,
+                                 const DenseTensor& inputx,
+                                 const SelectedRows& weight,
+                                 int64_t padding_idx,
+                                 DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/squeeze_grad_kernel.h b/paddle/phi/kernels/squeeze_grad_kernel.h
new file mode 100644
index 0000000000000..52b02bdbb9529
--- /dev/null
+++ b/paddle/phi/kernels/squeeze_grad_kernel.h
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SqueezeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& xshape,
+                       const DenseTensor& dout,
+                       const std::vector<int>& axes,
+                       DenseTensor* dx);
+}  // namespace phi
diff --git a/paddle/phi/kernels/squeeze_kernel.h b/paddle/phi/kernels/squeeze_kernel.h
new file mode 100644
index 0000000000000..22254eacfcefc
--- /dev/null
+++ b/paddle/phi/kernels/squeeze_kernel.h
@@ -0,0 +1,28 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SqueezeKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axes,
+                   DenseTensor* xshape,
+                   DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h
new file mode 100644
index 0000000000000..10faf5c48d5bf
--- /dev/null
+++ b/paddle/phi/kernels/tril_triu_grad_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TrilTriuGradKernel(const Context& ctx,
+                        const DenseTensor& out_grad,
+                        int diagonal,
+                        bool lower,
+                        DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h
new file mode 100644
index 0000000000000..4daa84e25c373
--- /dev/null
+++ b/paddle/phi/kernels/tril_triu_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TrilTriuKernel(const Context& ctx,
+                    const DenseTensor& x,
+                    int diagonal,
+                    bool lower,
+                    DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.h b/paddle/phi/kernels/unsqueeze_grad_kernel.h
new file mode 100644
index 0000000000000..0c5afe7be6039
--- /dev/null
+++ b/paddle/phi/kernels/unsqueeze_grad_kernel.h
@@ -0,0 +1,27 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void UnsqueezeGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x_shape,
+                         const DenseTensor& dout,
+                         DenseTensor* dx);
+}  // namespace phi
diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h
new file mode 100644
index 0000000000000..8f818a1b49042
--- /dev/null
+++ b/paddle/phi/kernels/unsqueeze_kernel.h
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void UnsqueezeKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const ScalarArray& axes,
+                     DenseTensor* xshape,
+                     DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index 396830ca20765..8b4884e35b608 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -16,40 +16,135 @@ limitations under the License. */
 
 namespace phi {
 
-#define DefineActGradDepXOpArgMap(func_name, op_name)                        \
-  KernelSignature func_name##GradOpArgumentMapping(                          \
-      const ArgumentMappingContext& ctx) {                                   \
-    return KernelSignature(                                                  \
-        op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \
+#define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(               \
+      const ArgumentMappingContext& ctx) {                        \
+    return KernelSignature(op_name "_grad",                       \
+                           {"X", GradVarName("Out")},             \
+                           {attrs},                               \
+                           {GradVarName("X")});                   \
   }
 
-#define DefineActGradDepOutOpArgMap(func_name, op_name)                        \
-  KernelSignature func_name##GradOpArgumentMapping(                            \
-      const ArgumentMappingContext& ctx) {                                     \
-    return KernelSignature(                                                    \
-        op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \
+#define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \
+  KernelSignature func_name##GradOpArgumentMapping(                 \
+      const ArgumentMappingContext& ctx) {                          \
+    return KernelSignature(op_name "_grad",                         \
+                           {"Out", GradVarName("Out")},             \
+                           {attrs},                                 \
+                           {GradVarName("X")});                     \
   }
 
+#define comma ,
+
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", );      // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", );    // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu,
+                               "thresholded_relu",
+                               "threshold");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", );                 // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", );               // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", );             // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", );             // NOLINT
+
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );        // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );        // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );  // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
+                                 "hard_sigmoid",
+                                 "slope" comma "offset");  // NOLINT
+
 KernelSignature ReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"});
 }
 
-DefineActGradDepXOpArgMap(Cos, "cos");
-DefineActGradDepXOpArgMap(Tan, "tan");
-DefineActGradDepXOpArgMap(Acos, "acos");
-DefineActGradDepXOpArgMap(Sin, "sin");
-DefineActGradDepXOpArgMap(Asin, "asin");
-DefineActGradDepXOpArgMap(Atan, "atan");
-DefineActGradDepXOpArgMap(Sinh, "sinh");
-DefineActGradDepXOpArgMap(Cosh, "cosh");
-DefineActGradDepXOpArgMap(Asinh, "asinh");
-DefineActGradDepXOpArgMap(Acosh, "acosh");
-DefineActGradDepXOpArgMap(Atanh, "atanh");
-DefineActGradDepOutOpArgMap(Relu, "relu");
+KernelSignature TanhDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature TanhTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("tanh_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
+KernelSignature SigmoidDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature SigmoidTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sigmoid_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
+KernelSignature LeakyReluDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"});
+}
+
+KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"});
+}
+
+KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"});
+}
+
+KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("elu_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"alpha"},
+                         {GradVarName("X")});
+}
+
+KernelSignature EluDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
+}
+
+KernelSignature LogDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "log_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
+PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
+PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
@@ -65,3 +160,40 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad,
                            phi::ReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad,
+                           phi::TanhDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad,
+                           phi::TanhTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad,
+                           phi::LeakyReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad,
+                           phi::LeakyReluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad,
+                           phi::ThresholdedReluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(softshrink_grad,
+                           phi::SoftShrinkGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad,
+                           phi::HardShrinkGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad,
+                           phi::TanhShrinkGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad,
+                           phi::SigmoidDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad,
+                           phi::SigmoidTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
+                           phi::LogSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
+                           phi::HardSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc
new file mode 100644
index 0000000000000..d149e8e6a9aa0
--- /dev/null
+++ b/paddle/phi/ops/compat/assign_sig.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("X")) {
+    if (ctx.IsDenseTensorVectorInput("X")) {
+      return KernelSignature("assign_array", {"X"}, {}, {"Out"});
+    } else if (ctx.IsSelectedRowsInput("X")) {
+      return KernelSignature("assign_sr", {"X"}, {}, {"Out"});
+    } else {
+      return KernelSignature("assign", {"X"}, {}, {"Out"});
+    }
+  } else {
+    return KernelSignature("assign", {"X"}, {}, {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc
index 011d4c12ecefc..803bb50b438a5 100644
--- a/paddle/phi/ops/compat/batch_norm_sig.cc
+++ b/paddle/phi/ops/compat/batch_norm_sig.cc
@@ -17,21 +17,42 @@
 namespace phi {
 
 KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("batch_norm",
-                         {"X", "Scale", "Bias", "Mean", "Variance"},
-                         {"momentum",
-                          "epsilon",
-                          "data_layout",
-                          "is_test",
-                          "use_global_stats",
-                          "trainable_statistics",
-                          "fuse_with_relu"},
-                         {"Y",
-                          "MeanOut",
-                          "VarianceOut",
-                          "SavedMean",
-                          "SavedVariance",
-                          "ReserveSpace"});
+  bool is_test = paddle::any_cast<bool>(ctx.Attr("is_test"));
+  bool use_global_stats =
+      ctx.HasAttr("use_global_stats")
+          ? paddle::any_cast<bool>(ctx.Attr("use_global_stats"))
+          : false;
+  bool trainable_statistics =
+      ctx.HasAttr("trainable_statistics")
+          ? paddle::any_cast<bool>(ctx.Attr("trainable_statistics"))
+          : false;
+  bool fuse_with_relu = ctx.HasAttr("fuse_with_relu")
+                            ? paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"))
+                            : false;
+  // Dispenable `MomentumTensor` is useless now
+  if (is_test && !use_global_stats && !trainable_statistics &&
+      !fuse_with_relu) {
+    return KernelSignature("batch_norm_infer",
+                           {"X", "Scale", "Bias", "Mean", "Variance"},
+                           {"momentum", "epsilon", "data_layout"},
+                           {"Y", "MeanOut", "VarianceOut"});
+  } else {
+    return KernelSignature("batch_norm",
+                           {"X", "Scale", "Bias", "Mean", "Variance"},
+                           {"momentum",
+                            "epsilon",
+                            "data_layout",
+                            "is_test",
+                            "use_global_stats",
+                            "trainable_statistics",
+                            "fuse_with_relu"},
+                           {"Y",
+                            "MeanOut",
+                            "VarianceOut",
+                            "SavedMean",
+                            "SavedVariance",
+                            "ReserveSpace"});
+  }
 }
 
 KernelSignature BatchNormGradOpArgumentMapping(
diff --git a/paddle/phi/ops/compat/conv2d_sig.cc b/paddle/phi/ops/compat/conv2d_sig.cc
index a755fdb19ec4b..67b99f1dd619c 100644
--- a/paddle/phi/ops/compat/conv2d_sig.cc
+++ b/paddle/phi/ops/compat/conv2d_sig.cc
@@ -17,18 +17,31 @@
 namespace phi {
 
 KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature("conv2d",
-                         {"Input", "Filter"},
-                         {"strides",
-                          "paddings",
-                          "padding_algorithm",
-                          "groups",
-                          "dilations",
-                          "data_format",
-                          "use_addto",
-                          "workspace_size_MB",
-                          "exhaustive_search"},
-                         {"Output"});
+  if (!ctx.HasAttr("use_addto") || !ctx.HasAttr("workspace_size_MB") ||
+      !ctx.HasAttr("exhaustive_search")) {
+    return KernelSignature("conv2d_infer",
+                           {"Input", "Filter"},
+                           {"strides",
+                            "paddings",
+                            "padding_algorithm",
+                            "groups",
+                            "dilations",
+                            "data_format"},
+                           {"Output"});
+  } else {
+    return KernelSignature("conv2d",
+                           {"Input", "Filter"},
+                           {"strides",
+                            "paddings",
+                            "padding_algorithm",
+                            "groups",
+                            "dilations",
+                            "data_format",
+                            "use_addto",
+                            "workspace_size_MB",
+                            "exhaustive_search"},
+                           {"Output"});
+  }
 }
 
 KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
diff --git a/paddle/phi/ops/compat/conv_transpose_sig.cc b/paddle/phi/ops/compat/conv_transpose_sig.cc
new file mode 100644
index 0000000000000..8697168b82747
--- /dev/null
+++ b/paddle/phi/ops/compat/conv_transpose_sig.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Conv2dTransposeOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_transpose",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {"Output"});
+}
+
+KernelSignature Conv2dTransposeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_transpose_grad",
+                         {"Input", "Filter", GradVarName("Output")},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature Conv2dTransposeDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv2d_transpose_grad_grad",
+                         {"Input", "Filter", "DOutput", "DDInput", "DDFilter"},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {"DInput", "DFilter", "DDOutput"});
+}
+
+KernelSignature Conv3dTransposeOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d_transpose",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {"Output"});
+}
+
+KernelSignature Conv3dTransposeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("conv3d_transpose_grad",
+                         {"Input", "Filter", GradVarName("Output")},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+KernelSignature DepthwiseConv2dTransposeOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("depthwise_conv2d_transpose",
+                         {"Input", "Filter"},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {"Output"});
+}
+
+KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("depthwise_conv2d_transpose_grad",
+                         {"Input", "Filter", GradVarName("Output")},
+                         {"strides",
+                          "paddings",
+                          "output_padding",
+                          "output_size",
+                          "padding_algorithm",
+                          "groups",
+                          "dilations",
+                          "data_format"},
+                         {GradVarName("Input"), GradVarName("Filter")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose,
+                           phi::Conv2dTransposeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad,
+                           phi::Conv2dTransposeGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad_grad,
+                           phi::Conv2dTransposeDoubleGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose,
+                           phi::Conv3dTransposeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose_grad,
+                           phi::Conv3dTransposeGradOpArgumentMapping);
+
+PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose,
+                           phi::DepthwiseConv2dTransposeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose_grad,
+                           phi::DepthwiseConv2dTransposeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc
new file mode 100644
index 0000000000000..01084e764ed9e
--- /dev/null
+++ b/paddle/phi/ops/compat/cumprod_sig.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature CumprodGradGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("cumprod_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"dim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc
new file mode 100644
index 0000000000000..e2a21673634c3
--- /dev/null
+++ b/paddle/phi/ops/compat/deformable_conv_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DeformableConvOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("deformable_conv",
+                         {"Input", "Offset", "Filter", "Mask"},
+                         {"strides",
+                          "paddings",
+                          "dilations",
+                          "deformable_groups",
+                          "groups",
+                          "im2col_step"},
+                         {"Output"});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(deformable_conv,
+                           phi::DeformableConvOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc
new file mode 100644
index 0000000000000..7bcd30ec5d79b
--- /dev/null
+++ b/paddle/phi/ops/compat/determinant_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DeterminantGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("determinant_grad",
+                         {"Input", "Out", GradVarName("Out")},
+                         {},
+                         {GradVarName("Input")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(determinant_grad,
+                           phi::DeterminantGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/embedding_sig.cc b/paddle/phi/ops/compat/embedding_sig.cc
new file mode 100644
index 0000000000000..b79a381dcecc7
--- /dev/null
+++ b/paddle/phi/ops/compat/embedding_sig.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature EmbeddingOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("W")) {
+    return KernelSignature("embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"});
+  } else {
+    return KernelSignature(
+        "sparse_weight_embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"});
+  }
+}
+
+KernelSignature EmbeddingGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("W")) {
+    if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) {
+      return KernelSignature("embedding_sparse_grad",
+                             {"Ids", "W", GradVarName("Out")},
+                             {"padding_idx"},
+                             {GradVarName("W")});
+    } else {
+      return KernelSignature("embedding_grad",
+                             {"Ids", "W", GradVarName("Out")},
+                             {"padding_idx"},
+                             {GradVarName("W")});
+    }
+  } else {
+    if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) {
+      return KernelSignature("sparse_weight_embedding_sparse_grad",
+                             {"Ids", "W", GradVarName("Out")},
+                             {"padding_idx"},
+                             {GradVarName("W")});
+    } else {
+      return KernelSignature("sparse_weight_embedding_grad",
+                             {"Ids", "W", GradVarName("Out")},
+                             {"padding_idx"},
+                             {GradVarName("W")});
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2, embedding);
+PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2_grad, embedding_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2, phi::EmbeddingOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2_grad,
+                           phi::EmbeddingGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc b/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc
new file mode 100644
index 0000000000000..444c0ec5b16fe
--- /dev/null
+++ b/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature FillConstantBatchSizeLikeOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  const auto& str_value = paddle::any_cast<std::string>(ctx.Attr("str_value"));
+  if (str_value.empty()) {
+    return KernelSignature(
+        "full_batch_size_like",
+        {"Input"},
+        {"shape", "value", "dtype", "input_dim_idx", "output_dim_idx"},
+        {"Out"});
+  } else {
+    return KernelSignature(
+        "full_batch_size_like",
+        {"Input"},
+        {"shape", "str_value", "dtype", "input_dim_idx", "output_dim_idx"},
+        {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(fill_constant_batch_size_like,
+                             full_batch_size_like);
+
+PD_REGISTER_ARG_MAPPING_FN(fill_constant_batch_size_like,
+                           phi::FillConstantBatchSizeLikeOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/frobenius_norm_sig.cc b/paddle/phi/ops/compat/frobenius_norm_sig.cc
new file mode 100644
index 0000000000000..c6dc5ad9014ec
--- /dev/null
+++ b/paddle/phi/ops/compat/frobenius_norm_sig.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature FrobeniusNormOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "frobenius_norm", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+}
+
+KernelSignature FrobeniusNormGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "frobenius_norm_grad",
+      {"X", "Out", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(frobenius_norm, phi::FrobeniusNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(frobenius_norm_grad,
+                           phi::FrobeniusNormGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc
new file mode 100644
index 0000000000000..6c47bbe48b8ee
--- /dev/null
+++ b/paddle/phi/ops/compat/gather_sig.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Axis")) {
+    return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"});
+  } else {
+    return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"});
+  }
+}
+
+KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Axis")) {
+    return KernelSignature("gather_grad",
+                           {"X", "Index", GradVarName("Out")},
+                           {"Axis", "overwrite"},
+                           {GradVarName("X")});
+  } else {
+    return KernelSignature("gather_grad",
+                           {"X", "Index", GradVarName("Out")},
+                           {"axis", "overwrite"},
+                           {GradVarName("X")});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc
new file mode 100644
index 0000000000000..bf4b47bcf5fa9
--- /dev/null
+++ b/paddle/phi/ops/compat/gelu_sig.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"});
+}
+
+KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("gelu_grad",
+                         {"X", GradVarName("Out")},
+                         {"approximate"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc
index dacb8b25a89f9..fa4da0704c987 100644
--- a/paddle/phi/ops/compat/graph_send_recv_sig.cc
+++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc
@@ -16,6 +16,14 @@ limitations under the License. */
 
 namespace phi {
 
+KernelSignature GraphSendRecvOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("graph_send_recv",
+                         {"X", "Src_index", "Dst_index"},
+                         {"pool_type", "out_size"},
+                         {"Out", "Dst_count"});
+}
+
 KernelSignature GraphSendRecvGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
@@ -27,5 +35,8 @@ KernelSignature GraphSendRecvGradOpArgumentMapping(
 
 }  // namespace phi
 
+PD_REGISTER_ARG_MAPPING_FN(graph_send_recv,
+                           phi::GraphSendRecvOpArgumentMapping);
+
 PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad,
                            phi::GraphSendRecvGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc
new file mode 100644
index 0000000000000..b76a9770d4ded
--- /dev/null
+++ b/paddle/phi/ops/compat/grid_sampler_sig.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature GridSamplerOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("grid_sample",
+                         {"X", "Grid"},
+                         {"mode", "padding_mode", "align_corners"},
+                         {"Output"});
+}
+
+KernelSignature GridSamplerGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("grid_sample_grad",
+                         {"X", "Grid", GradVarName("Output")},
+                         {"mode", "padding_mode", "align_corners"},
+                         {GradVarName("X"), GradVarName("Grid")});
+}
+
+}  // namespace phi
+
+// use Python API name as kernel name
+PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample);
+PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad,
+                           phi::GridSamplerGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc
new file mode 100644
index 0000000000000..20183d1a9b066
--- /dev/null
+++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature HierarchicalSigmoidOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("hierarchical_sigmoid",
+                         {"X", "W", "Label", "PathTable", "PathCode", "Bias"},
+                         {"num_classes",
+                          "remote_prefetch",
+                          "trainer_id",
+                          "height_sections",
+                          "epmap",
+                          "table_names",
+                          "is_sparse"},
+                         {"Out", "PreOut", "W_Out"});
+}
+
+KernelSignature HierarchicalSigmoidGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorOutput(GradVarName("W"))) {
+    return KernelSignature(
+        "hierarchical_sigmoid_grad",
+        {"X",
+         "W",
+         "Label",
+         "PreOut",
+         GradVarName("Out"),
+         "PathTable",
+         "PathCode",
+         "Bias"},
+        {"num_classes",
+         "remote_prefetch",
+         "trainer_id",
+         "height_sections",
+         "epmap",
+         "table_names",
+         "is_sparse"},
+        {GradVarName("X"), GradVarName("W"), GradVarName("Bias")});
+  } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) {
+    return KernelSignature(
+        "hierarchical_sigmoid_grad_sr",
+        {"X",
+         "W",
+         "Label",
+         "PreOut",
+         GradVarName("Out"),
+         "PathTable",
+         "PathCode",
+         "Bias"},
+        {"num_classes",
+         "remote_prefetch",
+         "trainer_id",
+         "height_sections",
+         "epmap",
+         "table_names",
+         "is_sparse"},
+        {GradVarName("X"), GradVarName("W"), GradVarName("Bias")});
+  } else {
+    return KernelSignature("unregistered", {}, {}, {});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid,
+                           phi::HierarchicalSigmoidOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad,
+                           phi::HierarchicalSigmoidGradOpArgumentMapping);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/phi/ops/compat/index_select_sig.cc
similarity index 50%
rename from paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
rename to paddle/phi/ops/compat/index_select_sig.cc
index a578c9f7d8108..53eff1bbcd7ed 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu
+++ b/paddle/phi/ops/compat/index_select_sig.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,14 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// .part used to speed up nvcc compile
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
+#include "paddle/phi/core/compat/op_utils.h"
 
-template <typename T>
-using CUDAReduceMeanGradKernel =
-    ops::ReduceCudaGradKernel<T, kps::DivideFunctor>;
+namespace phi {
 
-REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<bool>,
-                        CUDAReduceMeanGradKernel<paddle::platform::float16>,
-                        CUDAReduceMeanGradKernel<float>,
-                        CUDAReduceMeanGradKernel<double>);
+KernelSignature IndexSelectGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("index_select_grad",
+                         {"X", "Index", GradVarName("Out")},
+                         {"dim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(index_select_grad,
+                           phi::IndexSelectGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/isclose_sig.cc b/paddle/phi/ops/compat/isclose_sig.cc
new file mode 100644
index 0000000000000..08632e990958d
--- /dev/null
+++ b/paddle/phi/ops/compat/isclose_sig.cc
@@ -0,0 +1,50 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Rtol")) {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"Rtol", "Atol", "equal_nan"},
+                             {"Out"});
+
+    } else {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"Rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  } else {
+    if (ctx.HasInput("Atol")) {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"rtol", "Atol", "equal_nan"},
+                             {"Out"});
+    } else {
+      return KernelSignature("isclose",
+                             {"Input", "Other"},
+                             {"rtol", "atol", "equal_nan"},
+                             {"Out"});
+    }
+  }
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc
new file mode 100644
index 0000000000000..22d2f074e9f13
--- /dev/null
+++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature KLDivLossGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("kldiv_loss_grad",
+                         {"X", "Target", GradVarName("Loss")},
+                         {"reduction"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad,
+                           phi::KLDivLossGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc
new file mode 100644
index 0000000000000..e59e9de1e4382
--- /dev/null
+++ b/paddle/phi/ops/compat/kthvalue_sig.cc
@@ -0,0 +1,29 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature KthvalueGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("kthvalue_grad",
+                         {GradVarName("Out"), "X", "Indices"},
+                         {"k", "axis", "keepdim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc
new file mode 100644
index 0000000000000..17a81e9ec012f
--- /dev/null
+++ b/paddle/phi/ops/compat/layer_norm_sig.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("layer_norm",
+                         {"X", "Scale", "Bias"},
+                         {"epsilon", "begin_norm_axis", "is_test"},
+                         {"Y", "Mean", "Variance"});
+}
+
+KernelSignature LayerNormGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "layer_norm_grad",
+      {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")},
+      {"epsilon", "begin_norm_axis", "is_test"},
+      {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad,
+                           phi::LayerNormGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc
new file mode 100644
index 0000000000000..968ad4923ba7b
--- /dev/null
+++ b/paddle/phi/ops/compat/lgamma_sig.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "lgamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc
new file mode 100644
index 0000000000000..b1ecc6d56768f
--- /dev/null
+++ b/paddle/phi/ops/compat/log_softmax_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature LogSoftmaxGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("log_softmax_grad",
+                         {"Out", GradVarName("Out")},
+                         {"axis"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad,
+                           phi::LogSoftmaxGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc
new file mode 100644
index 0000000000000..20994c08aa73c
--- /dev/null
+++ b/paddle/phi/ops/compat/mode_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"});
+}
+
+KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("mode_grad",
+                         {"X", "Indices", GradVarName("Out")},
+                         {"axis", "keepdim"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc
new file mode 100644
index 0000000000000..9dab4655d1723
--- /dev/null
+++ b/paddle/phi/ops/compat/multiplex_sig.cc
@@ -0,0 +1,32 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"});
+}
+
+KernelSignature MultiplexGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/one_hot_sig.cc b/paddle/phi/ops/compat/one_hot_sig.cc
new file mode 100644
index 0000000000000..655969093c889
--- /dev/null
+++ b/paddle/phi/ops/compat/one_hot_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature OneHotOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("depth_tensor")) {
+    return KernelSignature("one_hot_raw",
+                           {"X"},
+                           {"depth_tensor", "dtype", "allow_out_of_range"},
+                           {"Out"});
+  } else {
+    return KernelSignature("one_hot_raw",
+                           {"X"},
+                           {"depth", "dtype", "allow_out_of_range"},
+                           {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot);
+
+PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/pad3d_sig.cc b/paddle/phi/ops/compat/pad3d_sig.cc
new file mode 100644
index 0000000000000..c43b98fa27e6b
--- /dev/null
+++ b/paddle/phi/ops/compat/pad3d_sig.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Paddings")) {
+    return KernelSignature(
+        "pad3d", {"X"}, {"Paddings", "mode", "value", "data_format"}, {"Out"});
+  }
+
+  return KernelSignature(
+      "pad3d", {"X"}, {"paddings", "mode", "value", "data_format"}, {"Out"});
+}
+
+KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("Paddings")) {
+    return KernelSignature("pad3d_grad",
+                           {"X", GradVarName("Out")},
+                           {"Paddings", "mode", "value", "data_format"},
+                           {GradVarName("X")});
+  }
+  return KernelSignature("pad3d_grad",
+                         {"X", GradVarName("Out")},
+                         {"paddings", "mode", "value", "data_format"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(pad3d, phi::Pad3dOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(pad3d_grad, phi::Pad3dGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc
new file mode 100644
index 0000000000000..bd296c5e95318
--- /dev/null
+++ b/paddle/phi/ops/compat/prelu_sig.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("prelu_grad",
+                         {"X", "Alpha", GradVarName("Out")},
+                         {"mode", "data_format"},
+                         {GradVarName("X"), GradVarName("Alpha")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping);
diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/ops/compat/qr_sig.cc
similarity index 57%
rename from paddle/phi/kernels/impl/shape_kernel_impl.h
rename to paddle/phi/ops/compat/qr_sig.cc
index 982cfb33f6b14..dd424d590ee11 100644
--- a/paddle/phi/kernels/impl/shape_kernel_impl.h
+++ b/paddle/phi/ops/compat/qr_sig.cc
@@ -12,25 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/compat/op_utils.h"
 
 namespace phi {
 
-template <typename T, typename Context>
-void ShapeKernel(const Context& ctx,
-                 const DenseTensor& input,
-                 DenseTensor* out) {
-  auto in_var = &input;
-  phi::DDim in_dims;
-  in_dims = in_var->dims();
-  auto out_t = out;
-  out_t->Resize({in_dims.size()});
-  auto out_data = ctx.template HostAlloc<int32_t>(out_t);
-  for (int i = 0; i < in_dims.size(); ++i) {
-    out_data[i] = in_dims[i];
-  }
+KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"});
 }
 
 }  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc
index 997f1505bd08d..273badee62381 100644
--- a/paddle/phi/ops/compat/reduce_sig.cc
+++ b/paddle/phi/ops/compat/reduce_sig.cc
@@ -41,8 +41,7 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "mean_raw" KernelSignature.
     // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
-    // the
-    // "mean_raw" KernelSignature
+    // the "mean_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature(
           "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
@@ -53,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
 }
 
 KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  return KernelSignature(
-      "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "max_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "max_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
 }
 
 KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
@@ -63,8 +73,7 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "max_raw" KernelSignature.
     // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
-    // the
-    // "max_raw" KernelSignature
+    // the "max_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature(
           "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
@@ -74,6 +83,50 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("unregistered", {}, {}, {});
 }
 
+KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "min_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "min_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "min_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("min", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "any_raw" KernelSignature.
+    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // the "any_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "any_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("any", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
+KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("all", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
 KernelSignature ReduceSumGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
@@ -83,16 +136,73 @@ KernelSignature ReduceSumGradOpArgumentMapping(
       {GradVarName("X")});
 }
 
+KernelSignature ReduceMeanGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "mean_grad",
+      {"X", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
+KernelSignature ReduceMaxGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "max_grad",
+      {"X", "Out", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
+KernelSignature ReduceMinGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "min_grad",
+      {"X", "Out", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
+KernelSignature ReduceProdGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "prod_grad",
+      {"X", "Out", GradVarName("Out")},
+      {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
+      {GradVarName("X")});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any);
+
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping);
+
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad,
                            phi::ReduceSumGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad,
+                           phi::ReduceMeanGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad,
+                           phi::ReduceProdGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad,
+                           phi::ReduceMaxGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad,
+                           phi::ReduceMinGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/reverse_sig.cc b/paddle/phi/ops/compat/reverse_sig.cc
new file mode 100644
index 0000000000000..0b70893fa7877
--- /dev/null
+++ b/paddle/phi/ops/compat/reverse_sig.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature ReverseOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorVectorInput("X")) {
+    return KernelSignature("reverse_array", {"X"}, {"axis"}, {"Out"});
+  } else {
+    return KernelSignature("reverse", {"X"}, {"axis"}, {"Out"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(reverse, phi::ReverseOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc
index 0549103b6fbcb..1717ec8f78809 100644
--- a/paddle/phi/ops/compat/roi_align_sig.cc
+++ b/paddle/phi/ops/compat/roi_align_sig.cc
@@ -16,7 +16,7 @@
 
 namespace phi {
 
-KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
+KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
   return KernelSignature("roi_align",
                          {"X", "ROIs", "RoisNum"},
                          {"pooled_height",
@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
                          {"Out"});
 }
 
+KernelSignature RoiAlignGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("roi_align_grad",
+                         {"X", "ROIs", "RoisNum", GradVarName("Out")},
+                         {"pooled_height",
+                          "pooled_width",
+                          "spatial_scale",
+                          "sampling_ratio",
+                          "aligned"},
+                         {GradVarName("X")});
+}
+
 }  // namespace phi
 
-PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc
new file mode 100644
index 0000000000000..d04c645f183c6
--- /dev/null
+++ b/paddle/phi/ops/compat/roi_pool_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("roi_pool",
+                         {"X", "ROIs", "RoisNum"},
+                         {"pooled_height", "pooled_width", "spatial_scale"},
+                         {"Out", "Argmax"});
+}
+
+KernelSignature RoiPoolOpGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("roi_pool_grad",
+                         {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")},
+                         {"pooled_height", "pooled_width", "spatial_scale"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc
new file mode 100644
index 0000000000000..a144f0e8e8a90
--- /dev/null
+++ b/paddle/phi/ops/compat/roll_sig.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.HasInput("ShiftsTensor")) {
+    return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"});
+  }
+  return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"});
+}
+
+KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("roll_grad",
+                         {"X", GradVarName("Out")},
+                         {"shifts", "axis"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc
index 9653250bded84..5feff54b028ba 100644
--- a/paddle/phi/ops/compat/set_value_sig.cc
+++ b/paddle/phi/ops/compat/set_value_sig.cc
@@ -19,9 +19,9 @@ namespace phi {
 
 KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
   if (ctx.IsDenseTensorInput("Input")) {
-    if (ctx.HasInput("StartsTensorList")) {
-      if (ctx.HasInput("EndsTensorList")) {
-        if (ctx.HasInput("StepsTensorList")) {
+    if (ctx.InputSize("StartsTensorList") > 0) {
+      if (ctx.InputSize("EndsTensorList") > 0) {
+        if (ctx.InputSize("StepsTensorList") > 0) {
           if (ctx.HasInput("ValueTensor")) {
             return KernelSignature("set_value_with_tensor",
                                    {"Input", "ValueTensor"},
@@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
           }
         }
       } else {
-        if (ctx.HasInput("StepsTensorList")) {
+        if (ctx.InputSize("StepsTensorList") > 0) {
           if (ctx.HasInput("ValueTensor")) {
             return KernelSignature("set_value_with_tensor",
                                    {"Input", "ValueTensor"},
@@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
         }
       }
     } else {
-      if (ctx.HasInput("EndsTensorList")) {
-        if (ctx.HasInput("StepsTensorList")) {
+      if (ctx.InputSize("EndsTensorList") > 0) {
+        if (ctx.InputSize("StepsTensorList") > 0) {
           if (ctx.HasInput("ValueTensor")) {
             return KernelSignature("set_value_with_tensor",
                                    {"Input", "ValueTensor"},
@@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
           }
         }
       } else {
-        if (ctx.HasInput("StepsTensorList")) {
+        if (ctx.InputSize("StepsTensorList") > 0) {
           if (ctx.HasInput("ValueTensor")) {
             return KernelSignature("set_value_with_tensor",
                                    {"Input", "ValueTensor"},
@@ -734,9 +734,9 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) {
 
 KernelSignature SetValueGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
-  if (ctx.HasInput("StartsTensorList")) {
-    if (ctx.HasInput("EndsTensorList")) {
-      if (ctx.HasInput("StepsTensorList")) {
+  if (ctx.InputSize("StartsTensorList") > 0) {
+    if (ctx.InputSize("EndsTensorList") > 0) {
+      if (ctx.InputSize("StepsTensorList") > 0) {
         return KernelSignature(
             "set_value_grad",
             {GradVarName("Out")},
@@ -760,7 +760,7 @@ KernelSignature SetValueGradOpArgumentMapping(
             {GradVarName("Input"), GradVarName("ValueTensor")});
       }
     } else {
-      if (ctx.HasInput("StepsTensorList")) {
+      if (ctx.InputSize("StepsTensorList") > 0) {
         return KernelSignature(
             "set_value_grad",
             {GradVarName("Out")},
@@ -785,8 +785,8 @@ KernelSignature SetValueGradOpArgumentMapping(
       }
     }
   } else {
-    if (ctx.HasInput("EndsTensorList")) {
-      if (ctx.HasInput("StepsTensorList")) {
+    if (ctx.InputSize("EndsTensorList") > 0) {
+      if (ctx.InputSize("StepsTensorList") > 0) {
         return KernelSignature(
             "set_value_grad",
             {GradVarName("Out")},
@@ -810,7 +810,7 @@ KernelSignature SetValueGradOpArgumentMapping(
             {GradVarName("Input"), GradVarName("ValueTensor")});
       }
     } else {
-      if (ctx.HasInput("StepsTensorList")) {
+      if (ctx.InputSize("StepsTensorList") > 0) {
         return KernelSignature(
             "set_value_grad",
             {GradVarName("Out")},
diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc
new file mode 100644
index 0000000000000..276246533e89e
--- /dev/null
+++ b/paddle/phi/ops/compat/squeeze_sig.cc
@@ -0,0 +1,36 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("squeeze", {"X"}, {"axes"}, {"XShape", "Out"});
+}
+
+KernelSignature SqueezeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("squeeze_grad",
+                         {"XShape", GradVarName("Out")},
+                         {"axes"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze);
+PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad);
+PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc
index 49a6d02225d93..ca3fa5fe1f86a 100644
--- a/paddle/phi/ops/compat/tile_sig.cc
+++ b/paddle/phi/ops/compat/tile_sig.cc
@@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) {
   if (ctx.HasInput("RepeatTimes")) {
     return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"});
   } else if (ctx.InputSize("repeat_times_tensor") > 0) {
+    const auto& repeat_times =
+        paddle::any_cast<std::vector<int>>(ctx.Attr("repeat_times"));
+    if (!ctx.IsRuntime() && !repeat_times.empty()) {
+      return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
+    }
     return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"});
   } else {
     return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"});
diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc
new file mode 100644
index 0000000000000..4f79f8650decf
--- /dev/null
+++ b/paddle/phi/ops/compat/tril_triu_sig.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"});
+}
+
+KernelSignature TrilTriuGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("tril_triu_grad",
+                         {GradVarName("Out")},
+                         {"diagonal", "lower"},
+                         {GradVarName("X")});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc
new file mode 100644
index 0000000000000..20cd9701e83e5
--- /dev/null
+++ b/paddle/phi/ops/compat/unsqueeze_sig.cc
@@ -0,0 +1,46 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.InputSize("AxesTensorList") > 0) {
+    VLOG(2) << "unsqueeze2 in AxesTensorList";
+    return KernelSignature(
+        "unsqueeze", {"X"}, {"AxesTensorList"}, {"XShape", "Out"});
+  } else if (ctx.InputSize("AxesTensor") > 0) {
+    VLOG(2) << "unsqueeze2 in AxesTensor";
+    return KernelSignature(
+        "unsqueeze", {"X"}, {"AxesTensor"}, {"XShape", "Out"});
+  } else {
+    VLOG(2) << "unsqueeze2 in axes";
+    return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"XShape", "Out"});
+  }
+}
+
+KernelSignature UnsqueezeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "unsqueeze_grad", {"XShape", GradVarName("Out")}, {}, {GradVarName("X")});
+}
+}  // namespace phi
+PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze);
+PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad,
+                           phi::UnsqueezeGradOpArgumentMapping);
diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc
index dd008ff36d50a..a2bd1f2cad9fc 100644
--- a/paddle/phi/tests/api/test_data_transform.cc
+++ b/paddle/phi/tests/api/test_data_transform.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -39,10 +40,10 @@ TEST(API, data_transform_same_place) {
   auto x = paddle::experimental::full({3, 3},
                                       1.0,
                                       experimental::DataType::COMPLEX128,
-                                      experimental::Backend::CPU);
+                                      experimental::CPUPlace());
 
   auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
+      {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
 
   std::vector<phi::dtype::complex<double>> sum(9, 6.0);
 
@@ -74,10 +75,10 @@ TEST(API, data_transform_same_place) {
 TEST(Tensor, data_transform_diff_place) {
   // 1. create tensor
   auto x = paddle::experimental::full(
-      {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU);
+      {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace());
 
   auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU);
+      {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace());
 
   std::vector<float> sum(9, 6.0);
 
@@ -95,7 +96,7 @@ TEST(Tensor, data_transform_diff_place) {
   ASSERT_EQ(out.impl()->place(),
             phi::TransToPhiPlace(experimental::Backend::GPU));
 
-  auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true);
+  auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true);
 
   auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(ref_out.impl());
   for (size_t i = 0; i < 9; i++) {
diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc
index 05a5563344966..ca4a264e511bd 100644
--- a/paddle/phi/tests/api/test_scale_benchmark.cc
+++ b/paddle/phi/tests/api/test_scale_benchmark.cc
@@ -30,7 +30,7 @@ namespace tests {
 
 TEST(API, scale) {
   auto x = experimental::full(
-      {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
+      {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
 
   const size_t cycles = 300;
   phi::tests::Timer timer;
diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc
index 8595782be35ab..da66334ced78a 100644
--- a/paddle/phi/tests/api/test_sparse_utils_api.cc
+++ b/paddle/phi/tests/api/test_sparse_utils_api.cc
@@ -53,8 +53,7 @@ TEST(API, to_sparse_coo) {
 
   // 1. test dense_to_sparse_coo
   paddle::experimental::Tensor x(dense_x);
-  auto out = paddle::experimental::sparse::to_sparse_coo(
-      x, phi::Backend::CPU, sparse_dim);
+  auto out = paddle::experimental::sparse::to_sparse_coo(x, sparse_dim);
   auto coo = std::dynamic_pointer_cast<phi::SparseCooTensor>(out.impl());
   ASSERT_EQ(coo->nnz(), non_zero_num);
   int cmp_indices = memcmp(coo->non_zero_indices().data<int64_t>(),
@@ -91,8 +90,7 @@ TEST(API, to_sparse_coo) {
   auto csr =
       std::make_shared<phi::SparseCsrTensor>(crows, cols, values, dense_dims);
   paddle::experimental::Tensor csr_x(csr);
-  auto out2 = paddle::experimental::sparse::to_sparse_coo(
-      csr_x, phi::Backend::CPU, sparse_dim);
+  auto out2 = paddle::experimental::sparse::to_sparse_coo(csr_x, sparse_dim);
 
   auto coo2 = std::dynamic_pointer_cast<phi::SparseCooTensor>(out.impl());
   ASSERT_EQ(coo2->nnz(), non_zero_num);
@@ -132,7 +130,7 @@ TEST(API, to_sparse_csr) {
 
   // 1. test dense_to_sparse_csr
   paddle::experimental::Tensor x(dense_x);
-  auto out = paddle::experimental::sparse::to_sparse_csr(x, phi::Backend::CPU);
+  auto out = paddle::experimental::sparse::to_sparse_csr(x);
   auto csr = std::dynamic_pointer_cast<phi::SparseCsrTensor>(out.impl());
   auto check = [&](const phi::SparseCsrTensor& csr) {
     ASSERT_EQ(csr.non_zero_cols().numel(), non_zero_num);
@@ -170,8 +168,7 @@ TEST(API, to_sparse_csr) {
   auto coo =
       std::make_shared<phi::SparseCooTensor>(indices, values, dense_dims);
   paddle::experimental::Tensor coo_x(coo);
-  auto out2 =
-      paddle::experimental::sparse::to_sparse_csr(coo_x, phi::Backend::CPU);
+  auto out2 = paddle::experimental::sparse::to_sparse_csr(coo_x);
 
   auto csr2 = std::dynamic_pointer_cast<phi::SparseCsrTensor>(out.impl());
   check(*csr2);
@@ -212,7 +209,7 @@ TEST(API, to_dense) {
       std::make_shared<phi::SparseCooTensor>(indices, values, dense_dims);
 
   paddle::experimental::Tensor coo_x(coo);
-  auto out = paddle::experimental::sparse::to_dense(coo_x, phi::Backend::CPU);
+  auto out = paddle::experimental::sparse::to_dense(coo_x);
   auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(out.impl());
   int cmp1 =
       memcmp(dense_out->data<float>(), &dense_data[0][0], 9 * sizeof(float));
@@ -237,7 +234,7 @@ TEST(API, to_dense) {
   auto csr =
       std::make_shared<phi::SparseCsrTensor>(crows, cols, values, dense_dims);
   paddle::experimental::Tensor csr_x(csr);
-  auto out2 = paddle::experimental::sparse::to_dense(csr_x, phi::Backend::CPU);
+  auto out2 = paddle::experimental::sparse::to_dense(csr_x);
 
   auto dense_out2 = std::dynamic_pointer_cast<phi::DenseTensor>(out.impl());
   int cmp2 =
diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc
index 66c478e4c0001..4e8755be0c773 100644
--- a/paddle/phi/tests/api/test_to_api.cc
+++ b/paddle/phi/tests/api/test_to_api.cc
@@ -69,10 +69,10 @@ TEST(API, copy_to) {
 
 // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = paddle::experimental::copy_to(x, phi::Backend::GPU, false);
-  auto out = paddle::experimental::copy_to(tmp, phi::Backend::CPU, true);
+  auto tmp = paddle::experimental::copy_to(x, phi::GPUPlace(), false);
+  auto out = paddle::experimental::copy_to(tmp, phi::CPUPlace(), true);
 #else
-  auto out = paddle::experimental::copy_to(x, phi::Backend::CPU, false);
+  auto out = paddle::experimental::copy_to(x, phi::CPUPlace(), false);
 #endif
 
   // 3. check result
@@ -85,10 +85,10 @@ TEST(Tensor, copy_to) {
 
 // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = x.copy_to(phi::Backend::GPU, false);
-  auto out = tmp.copy_to(phi::Backend::CPU, true);
+  auto tmp = x.copy_to(phi::GPUPlace(), false);
+  auto out = tmp.copy_to(phi::CPUPlace(), true);
 #else
-  auto out = x.copy_to(phi::Backend::CPU, false);
+  auto out = x.copy_to(phi::CPUPlace(), false);
 #endif
 
   // 3. check result
diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt
index 317dcce92c8ed..3897c182e481c 100644
--- a/paddle/phi/tests/kernels/CMakeLists.txt
+++ b/paddle/phi/tests/kernels/CMakeLists.txt
@@ -14,6 +14,7 @@ cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils)
 cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils)
 cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils)
 cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils)
+cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils)
 
 cc_test(test_math_function SRCS test_math_function.cc DEPS math_function)
 if(WITH_GPU)
diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc
index d69c7b2174f72..460d85f83133f 100644
--- a/paddle/phi/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc
@@ -61,6 +61,10 @@ TEST(DEV_API, copy) {
   dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
+  dev_ctx.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   dev_ctx.Init();
   phi::Copy(
       dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
index 3e5f965074156..9552c02976f30 100644
--- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/elementwise_kernel.h"
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc
index dc283728ee5f7..e3f2e8b57e3df 100644
--- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc
@@ -58,6 +58,10 @@ TEST(DEV_API, flatten) {
   dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
+  dev_ctx.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   dev_ctx.Init();
 
   // 2. test API
diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc
index 23edfeacaf814..ce31b2021e01a 100644
--- a/paddle/phi/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc
index 16ad4fc341be0..7de039372fa9c 100644
--- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc
@@ -50,6 +50,10 @@ TEST(DEV_API, reshape) {
   dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(paddle::platform::CPUPlace())
                            .get());
+  dev_ctx.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
   dev_ctx.Init();
   auto out = phi::Reshape<float>(dev_ctx, dense_x, shape);
   // 3. check result
diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
index 37a69a176c6e1..4800e1402ba56 100644
--- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
@@ -132,16 +132,17 @@ void TestConv3dBase(const std::vector<int>& indices,
     f_verify(out.non_zero_elements().data<T>(), correct_out_features);
 
     if (backward) {
-      std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_cpu,
-                                                             x_tensor,
-                                                             rulebook,
-                                                             kernel_tensor,
-                                                             out,
-                                                             paddings,
-                                                             dilations,
-                                                             strides,
-                                                             1,
-                                                             subm);
+      std::vector<DenseTensor> grads =
+          sparse::Conv3dGrad<T>(dev_ctx_cpu,
+                                x_tensor,
+                                rulebook,
+                                kernel_tensor,
+                                out.non_zero_elements(),
+                                paddings,
+                                dilations,
+                                strides,
+                                1,
+                                subm);
       f_verify(grads[0].data<T>(), features_grad);
       f_verify(grads[1].data<T>(), kernel_grad);
     }
@@ -231,16 +232,17 @@ void TestConv3dBase(const std::vector<int>& indices,
   f_verify(h_features_tensor.data<T>(), correct_out_features);
 
   if (backward) {
-    std::vector<DenseTensor> grads = sparse::Conv3dGrad<T>(dev_ctx_gpu,
-                                                           d_x_tensor,
-                                                           d_rulebook,
-                                                           d_kernel_tensor,
-                                                           d_out,
-                                                           paddings,
-                                                           dilations,
-                                                           strides,
-                                                           1,
-                                                           subm);
+    std::vector<DenseTensor> grads =
+        sparse::Conv3dGrad<T>(dev_ctx_gpu,
+                              d_x_tensor,
+                              d_rulebook,
+                              d_kernel_tensor,
+                              d_out.non_zero_elements(),
+                              paddings,
+                              dilations,
+                              strides,
+                              1,
+                              subm);
     DenseTensor h_features_grad = phi::Empty(
         dev_ctx_cpu,
         DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout()));
diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
new file mode 100644
index 0000000000000..27673704168c9
--- /dev/null
+++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
@@ -0,0 +1,391 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
+#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace tests {
+
+template <typename T1, typename T2>
+std::vector<T2> cast(const std::vector<T1>& in) {
+  std::vector<T2> out(in.size());
+  for (uint64_t i = 0; i < in.size(); i++) {
+    out[i] = static_cast<T2>(in[i]);
+  }
+  return out;
+}
+template <typename T>
+void TestMaxPoolBase(const std::vector<int>& indices,
+                     const std::vector<T>& features,
+                     const DDim& x_dims,
+                     const std::vector<int>& correct_out_indices,
+                     const std::vector<T>& correct_out_features,
+                     const DDim& correct_out_dims,
+                     const int non_zero_num,
+                     const std::vector<int>& kernel_sizes,
+                     const std::vector<int>& paddings,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& dilations,
+                     const float diff = 1e-3,
+                     const bool backward = false,
+                     const std::vector<T> features_grad = {}) {
+  phi::CPUContext dev_ctx_cpu;
+  dev_ctx_cpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(paddle::platform::CPUPlace())
+          .get());
+  dev_ctx_cpu.Init();
+
+  const int in_channels = x_dims[4];
+  const int out_channels = in_channels;
+
+  DenseTensor indices_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
+  memcpy(
+      indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
+  DenseTensor features_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      {non_zero_num, in_channels},
+                      DataLayout::NHWC));
+  memcpy(
+      features_tensor.data<T>(), features.data(), features.size() * sizeof(T));
+
+  SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims);
+
+  auto f_verify = [&](const T* real_data, const std::vector<T>& correct_data) {
+    for (uint64_t i = 0; i < correct_data.size(); i++) {
+      float tmp = std::fabs(static_cast<float>(correct_data[i] - real_data[i]));
+      ASSERT_LT(tmp, diff);
+    }
+  };
+
+  if (!std::is_same<T, phi::dtype::float16>::value) {
+    DenseTensor rulebook = phi::Empty(
+        dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+    SparseCooTensor out = sparse::MaxPool<T>(dev_ctx_cpu,
+                                             x_tensor,
+                                             kernel_sizes,
+                                             paddings,
+                                             dilations,
+                                             strides,
+                                             &rulebook);
+
+    ASSERT_EQ(correct_out_dims.size(), out.dims().size());
+    for (int i = 0; i < correct_out_dims.size(); i++) {
+      ASSERT_EQ(correct_out_dims[i], out.dims()[i]);
+    }
+    ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
+
+    int cmp_indices = memcmp(correct_out_indices.data(),
+                             out.non_zero_indices().data<int>(),
+                             correct_out_indices.size() * sizeof(int));
+    ASSERT_EQ(cmp_indices, 0);
+
+    f_verify(out.non_zero_elements().data<T>(), correct_out_features);
+
+    if (backward) {
+      DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_cpu,
+                                                  x_tensor,
+                                                  rulebook,
+                                                  out,
+                                                  out.non_zero_elements(),
+                                                  kernel_sizes);
+      f_verify(x_grad.data<T>(), features_grad);
+    }
+  }
+
+// test gpu
+#if defined(PADDLE_WITH_CUDA)
+  phi::GPUContext dev_ctx_gpu;
+  dev_ctx_gpu.PartialInitWithoutAllocator();
+  dev_ctx_gpu.SetAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream())
+          .get());
+  dev_ctx_gpu.SetHostAllocator(
+      paddle::memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(phi::CPUPlace())
+          .get());
+  dev_ctx_gpu.PartialInitWithAllocator();
+
+  DenseTensor d_indices_tensor = phi::Empty(
+      dev_ctx_gpu,
+      DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
+  phi::Copy(
+      dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
+
+  DenseTensor d_features_tensor = phi::Empty(
+      dev_ctx_gpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      {non_zero_num, in_channels},
+                      DataLayout::NHWC));
+  phi::Copy(
+      dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor);
+
+  SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
+
+  DenseTensor d_rulebook = phi::Empty(
+      dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
+  SparseCooTensor d_out = sparse::MaxPool<T>(dev_ctx_gpu,
+                                             d_x_tensor,
+                                             kernel_sizes,
+                                             paddings,
+                                             dilations,
+                                             strides,
+                                             &d_rulebook);
+
+  ASSERT_EQ(correct_out_dims.size(), d_out.dims().size());
+  ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz());
+  for (int i = 0; i < correct_out_dims.size(); i++) {
+    ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]);
+  }
+
+  DenseTensor h_indices_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
+  phi::Copy(dev_ctx_gpu,
+            d_out.non_zero_indices(),
+            phi::CPUPlace(),
+            true,
+            &h_indices_tensor);
+
+  int cmp_indices2 = memcmp(correct_out_indices.data(),
+                            h_indices_tensor.data<int>(),
+                            correct_out_indices.size() * sizeof(int));
+  ASSERT_EQ(cmp_indices2, 0);
+
+  DenseTensor h_features_tensor = phi::Empty(
+      dev_ctx_cpu,
+      DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
+                      {d_out.nnz()},
+                      d_out.layout()));
+
+  phi::Copy(dev_ctx_gpu,
+            d_out.non_zero_elements(),
+            phi::CPUPlace(),
+            true,
+            &h_features_tensor);
+  f_verify(h_features_tensor.data<T>(), correct_out_features);
+
+  if (backward) {
+    DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_gpu,
+                                                d_x_tensor,
+                                                d_rulebook,
+                                                d_out,
+                                                d_out.non_zero_elements(),
+                                                kernel_sizes);
+    DenseTensor h_features_grad = phi::Empty(
+        dev_ctx_cpu,
+        DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout()));
+    phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad);
+    f_verify(h_features_grad.data<T>(), features_grad);
+  }
+#endif
+}
+
+void TestMaxPool(const std::vector<int>& indices,
+                 const std::vector<float>& features,
+                 const DDim& x_dims,
+                 const std::vector<int>& correct_out_indices,
+                 const std::vector<float>& correct_out_features,
+                 const DDim& correct_out_dims,
+                 const int non_zero_num,
+                 const std::vector<int>& kernel_sizes,
+                 const std::vector<int>& paddings,
+                 const std::vector<int>& strides,
+                 const std::vector<int>& dilations,
+                 const float diff = 1e-3,
+                 const bool backward = false,
+                 const std::vector<float> features_grad = {}) {
+  // test float
+  TestMaxPoolBase<float>(indices,
+                         features,
+                         x_dims,
+                         correct_out_indices,
+                         correct_out_features,
+                         correct_out_dims,
+                         non_zero_num,
+                         kernel_sizes,
+                         paddings,
+                         strides,
+                         dilations,
+                         diff,
+                         backward,
+                         features_grad);
+  // test double
+  TestMaxPoolBase<double>(indices,
+                          cast<float, double>(features),
+                          x_dims,
+                          correct_out_indices,
+                          cast<float, double>(correct_out_features),
+                          correct_out_dims,
+                          non_zero_num,
+                          kernel_sizes,
+                          paddings,
+                          strides,
+                          dilations,
+                          diff,
+                          backward,
+                          cast<float, double>(features_grad));
+}
+
+TEST(DEV_API, sparse_maxpool) {
+  const int channels = 1;
+  DDim x_dims = {1, 1, 4, 4, channels};
+  DDim out_dims = {1, 1, 2, 2, channels};
+  std::vector<int> kernel_sizes = {1, 3, 3};
+  std::vector<int> paddings = {0, 0, 0};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+  std::vector<float> features = {1, 2, 3};
+  std::vector<int> out_indices = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+  };
+  std::vector<float> out_features = {2, 2, 3, 3};
+  std::vector<float> x_grad = {0, 4, 6};
+
+  TestMaxPool(indices,
+              features,
+              x_dims,
+              out_indices,
+              out_features,
+              out_dims,
+              non_zero_num,
+              kernel_sizes,
+              paddings,
+              strides,
+              dilations,
+              1e-6,
+              true,
+              x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool_stride) {
+  const int channels = 1;
+  DDim x_dims = {1, 1, 4, 4, channels};
+  DDim out_dims = {1, 1, 1, 1, channels};
+  std::vector<int> kernel_sizes = {1, 3, 3};
+  std::vector<int> paddings = {0, 0, 0};
+  std::vector<int> strides = {2, 2, 2};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+  std::vector<float> features = {1, 2, 3};
+  std::vector<int> out_indices = {0, 0, 0, 0};
+  std::vector<float> out_features = {2};
+  std::vector<float> x_grad = {0, 2, 0};
+
+  TestMaxPool(indices,
+              features,
+              x_dims,
+              out_indices,
+              out_features,
+              out_dims,
+              non_zero_num,
+              kernel_sizes,
+              paddings,
+              strides,
+              dilations,
+              1e-6,
+              true,
+              x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool_channel) {
+  const int channels = 2;
+  DDim x_dims = {1, 1, 4, 4, channels};
+  DDim out_dims = {1, 1, 2, 2, channels};
+  std::vector<int> kernel_sizes = {1, 3, 3};
+  std::vector<int> paddings = {0, 0, 0};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+  std::vector<float> features = {1, 1, 2, 2, 3, 3};
+  std::vector<int> out_indices = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+  };
+  std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
+  std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
+
+  TestMaxPool(indices,
+              features,
+              x_dims,
+              out_indices,
+              out_features,
+              out_dims,
+              non_zero_num,
+              kernel_sizes,
+              paddings,
+              strides,
+              dilations,
+              1e-6,
+              true,
+              x_grad);
+}
+
+TEST(DEV_API, sparse_maxpool3d) {
+  const int channels = 2;
+  DDim x_dims = {1, 5, 4, 4, channels};
+  DDim out_dims = {1, 3, 2, 2, channels};
+  std::vector<int> kernel_sizes = {3, 3, 3};
+  std::vector<int> paddings = {0, 0, 0};
+  std::vector<int> strides = {1, 1, 1};
+  std::vector<int> dilations = {1, 1, 1};
+
+  const int non_zero_num = 3;
+  std::vector<int> indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2};
+  std::vector<float> features = {1, 1, 2, 2, 3, 3};
+  std::vector<int> out_indices = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
+  };
+  std::vector<float> out_features = {2, 2, 2, 2, 3, 3, 3, 3};
+  std::vector<float> x_grad = {0, 0, 4, 4, 6, 6};
+
+  TestMaxPool(indices,
+              features,
+              x_dims,
+              out_indices,
+              out_features,
+              out_dims,
+              non_zero_num,
+              kernel_sizes,
+              paddings,
+              strides,
+              dilations,
+              1e-6,
+              true,
+              x_grad);
+}
+
+}  // namespace tests
+}  // namespace phi
diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc
index dfec291bc072f..82fa90c1574bd 100644
--- a/paddle/phi/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/phi/kernels/math_kernel.h"
+#include "paddle/phi/kernels/reduce_kernel.h"
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h
index 06048f33d940a..8468dad10eb64 100644
--- a/paddle/phi/tests/ops/test_op_signature.h
+++ b/paddle/phi/tests/ops/test_op_signature.h
@@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext {
     return selected_rows_inputs.count(name) > 0;
   }
 
+  // add member if needed
+  bool IsDenseTensorVectorInput(const std::string& name) const override {
+    return false;
+  }
+
   bool IsDenseTensorOutput(const std::string& name) const override {
     return dense_tensor_outputs.count(name) > 0;
   }
diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh
index 76b45ff89f186..37e19b49f1cd0 100755
--- a/paddle/scripts/infrt_build.sh
+++ b/paddle/scripts/infrt_build.sh
@@ -44,6 +44,9 @@ function update_pd_ops() {
    cd ${PADDLE_ROOT}/tools/infrt/
    python3 generate_pd_op_dialect_from_paddle_op_maker.py
    python3 generate_phi_kernel_dialect.py
+   # generate test model
+   cd ${PADDLE_ROOT}
+   python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs
 }
 
 function init() {
@@ -93,7 +96,7 @@ function infrt_gen_and_build() {
         exit 7;
     fi
 
-    make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$?
+    make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$?
     if [ "$build_error" != 0 ];then
         exit 7;
     fi
@@ -111,6 +114,7 @@ function create_fake_models() {
     python3 -m pip install  *whl
     cd ${PADDLE_ROOT}/build
     python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py
+    python3 ${PADDLE_ROOT}/paddle/infrt/tests/model/linear.py
 }
 
 function test_infrt() {
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 75afa4ef43ff6..78a863040ade1 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -55,7 +55,6 @@ wmic process where name="python.exe" call terminate 2>NUL
 
 rem ------initialize common variable------
 if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64"
-if not defined BRANCH set BRANCH=develop
 if not defined WITH_TENSORRT set WITH_TENSORRT=ON
 if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT
 if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto
@@ -70,7 +69,6 @@ if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF
 if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON
 if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON
 if not defined WITH_TPCACHE set WITH_TPCACHE=OFF
-if not defined WITH_CLCACHE set WITH_CLCACHE=OFF
 if not defined WITH_CACHE set WITH_CACHE=OFF
 if not defined WITH_SCCACHE set WITH_SCCACHE=OFF
 if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF
@@ -145,17 +143,6 @@ if %day_now% NEQ %day_before% (
     echo %day_now% > %cache_dir%\day.txt
     type %cache_dir%\day.txt
     rmdir %BUILD_DIR% /s/q
-
-    : clear third party cache every once in a while
-    if %day_now% EQU 21 (
-        rmdir %cache_dir%\third_party /s/q
-    )
-    if %day_now% EQU 11 (
-        rmdir %cache_dir%\third_party /s/q
-    )
-    if %day_now% EQU 01 (
-        rmdir %cache_dir%\third_party /s/q
-    )
     goto :mkbuild
 )
 
@@ -212,6 +199,7 @@ echo There is not sccache in this PC, will install sccache.
 echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe
 %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')"
 xcopy sccache.exe %PYTHON_ROOT%\ /Y
+del sccache.exe
 goto:eof
 rem -------Caching strategy 2: End --------------------------------
 
@@ -232,13 +220,12 @@ set WITH_AVX=ON
 set MSVC_STATIC_CRT=OFF
 set ON_INFER=OFF
 set WITH_TENSORRT=ON
+set WITH_INFERENCE_API_TEST=OFF
 
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 call :test_unit || goto test_unit_error
-:: call :test_inference || goto test_inference_error
-:: call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 rem ------PR CI windows check for OPENBLAS/CPU------
@@ -254,8 +241,6 @@ call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 call :test_unit || goto test_unit_error
-:: call :test_inference || goto test_inference_error
-:: call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX----------
@@ -265,7 +250,6 @@ set WITH_GPU=ON
 set WITH_AVX=ON
 set MSVC_STATIC_CRT=ON
 set ON_INFER=ON
-set WITH_TESTING=ON
 set WITH_TENSORRT=ON
 set WITH_INFERENCE_API_TEST=ON
 
@@ -274,7 +258,8 @@ call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
 call :test_unit || goto test_unit_error
 ::call :test_inference || goto test_inference_error
-:: call :check_change_of_unittest || goto check_change_of_unittest_error
+::call :test_inference_ut || goto test_inference_ut_error
+call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
 rem ------Build windows avx whl package------
@@ -365,18 +350,6 @@ if "%WITH_GPU%"=="ON" (
     nvidia-smi 2>NUL
 )
 
-rem ------pre install clcache and init config----------
-rem pip install clcache --user
-pip uninstall -y clcache
-:: set USE_CLCACHE to enable clcache
-rem set USE_CLCACHE=1
-:: In some scenarios, CLCACHE_HARDLINK can save one file copy.
-rem set CLCACHE_HARDLINK=1
-:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported
-rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000
-:: set maximum cache size to 20G
-rem clcache.exe -M 21474836480
-
 rem ------set third_party cache dir------
 
 if "%WITH_TPCACHE%"=="OFF" (
@@ -384,6 +357,25 @@ if "%WITH_TPCACHE%"=="OFF" (
     goto :cmake_impl
 )
 
+rem clear third party cache every ten days
+for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
+set day_now=%datetime:~6,2%
+set day_before=-1
+set /p day_before=< %cache_dir%\day_third_party.txt
+if %day_now% NEQ %day_before% (
+    echo %day_now% > %cache_dir%\day_third_party.txt
+    type %cache_dir%\day_third_party.txt
+    if %day_now% EQU 21 (
+        rmdir %cache_dir%\third_party /s/q
+    )
+    if %day_now% EQU 11 (
+        rmdir %cache_dir%\third_party /s/q
+    )
+    if %day_now% EQU 01 (
+        rmdir %cache_dir%\third_party /s/q
+    )
+)
+
 echo set -ex > cache.sh
 echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake  ^|md5sum ^| awk '{print $1}') >> cache.sh
 echo echo ${md5_content}^>md5.txt >> cache.sh
@@ -535,11 +527,7 @@ echo Build Paddle the %build_times% time:
 if %GENERATOR% == "Ninja" (
     ninja all
 ) else (
-    if "%WITH_CLCACHE%"=="OFF" (
-        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
-    ) else (
-        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
-    )
+    MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
 )
 
 if %ERRORLEVEL% NEQ 0 (
@@ -774,77 +762,8 @@ echo    ========================================
 echo    Step 6. Check whether deleting a unit test ...
 echo    ========================================
 
-cd /d %work_dir%\%BUILD_DIR%
-echo set -e>  check_change_of_unittest.sh
-echo set +x>> check_change_of_unittest.sh
-echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >>  check_change_of_unittest.sh
-echo GIT_PR_ID=%AGILE_PULL_ID% >>  check_change_of_unittest.sh
-echo BRANCH=%BRANCH%>>  check_change_of_unittest.sh
-echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh
-echo     exit 0 >>  check_change_of_unittest.sh
-echo fi>>  check_change_of_unittest.sh
-echo set -x>> check_change_of_unittest.sh
-echo cat ^<^<EOF>>  check_change_of_unittest.sh
-echo     ============================================ >>  check_change_of_unittest.sh
-echo     Generate unit tests.spec of this PR.         >>  check_change_of_unittest.sh
-echo     ============================================ >>  check_change_of_unittest.sh
-echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/UNITTEST_PR.spec>>  check_change_of_unittest.sh
-echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
-echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh
-echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh
-echo echo ipipe_log_param_Windows_1_Card_TestCases_Count: $num>> check_change_of_unittest.sh
-echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>>  check_change_of_unittest.sh
-echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>>  check_change_of_unittest.sh
-echo if [ "$origin_upstream_url" == "" ]; then>>  check_change_of_unittest.sh
-echo     git remote add upstream $UPSTREAM_URL.git>>  check_change_of_unittest.sh
-echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>>  check_change_of_unittest.sh
-echo         ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>>  check_change_of_unittest.sh
-echo     git remote remove upstream>>  check_change_of_unittest.sh
-echo     git remote add upstream $UPSTREAM_URL.git>>  check_change_of_unittest.sh
-echo fi>>  check_change_of_unittest.sh
-echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>>  check_change_of_unittest.sh
-echo     git fetch upstream $BRANCH # develop is not fetched>>  check_change_of_unittest.sh
-echo fi>>  check_change_of_unittest.sh
-echo git checkout -b origin_pr >>  check_change_of_unittest.sh
-echo git checkout -f $BRANCH >>  check_change_of_unittest.sh
-echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
--DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
--DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
--DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
--DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^
--DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME%  >>  check_change_of_unittest.sh
-echo cat ^<^<EOF>>  check_change_of_unittest.sh
-echo     ============================================       >>  check_change_of_unittest.sh
-echo     Generate unit tests.spec of develop.               >>  check_change_of_unittest.sh
-echo     ============================================       >>  check_change_of_unittest.sh
-echo EOF>>  check_change_of_unittest.sh
-echo spec_path=$(pwd)/UNITTEST_DEV.spec>>  check_change_of_unittest.sh
-echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>>  check_change_of_unittest.sh
-echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>>  check_change_of_unittest.sh
-echo if [ "$unittest_spec_diff" ^!= "" ]; then>>  check_change_of_unittest.sh
-echo     set +x>> check_change_of_unittest.sh
-echo     approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>>  check_change_of_unittest.sh
-echo     set -x>> check_change_of_unittest.sh
-echo     if [ "$approval_line" ^!= "" ]; then>>  check_change_of_unittest.sh
-echo         APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>>  check_change_of_unittest.sh
-echo         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">>  check_change_of_unittest.sh
-echo         if [ "${APPROVALS}" == "FALSE" ]; then>>  check_change_of_unittest.sh
-echo             echo "************************************"                >>  check_change_of_unittest.sh
-echo             echo -e "It is forbidden to disable or delete the unit-test.\n"        >>  check_change_of_unittest.sh
-echo             echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]."     >>  check_change_of_unittest.sh
-echo             echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n"                 >>  check_change_of_unittest.sh
-echo             echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n"   >>  check_change_of_unittest.sh
-echo             echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n"     >>  check_change_of_unittest.sh
-echo             echo "************************************"                >>  check_change_of_unittest.sh
-echo             exit 1 >>  check_change_of_unittest.sh
-echo          fi>>  check_change_of_unittest.sh
-echo     else>>  check_change_of_unittest.sh
-echo          exit 1 >>  check_change_of_unittest.sh
-echo     fi>>  check_change_of_unittest.sh
-echo fi>>  check_change_of_unittest.sh
-echo git checkout -f origin_pr >>  check_change_of_unittest.sh
-%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh
+%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\check_change_of_unittest.sh
+
 goto:eof
 
 :check_change_of_unittest_error
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 39676b916e504..bc19b50616d13 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -774,12 +774,12 @@ set +x
         get_precision_ut_mac
         ut_actual_total_startTime_s=`date +%s`
         if [[ "$on_precision" == "0" ]];then
-            ctest -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
         else
-            ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
             tmpfile_rand=`date +%s%N`
             tmpfile=$tmp_dir/$tmpfile_rand
-            ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
         fi
         ut_total_endTime_s=`date +%s`
         echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s"
@@ -848,7 +848,7 @@ set +x
                                 fi
                             done
                         failed_test_lists=''
-                        ctest -R "$retry_unittests_regular" --output-on-failure -j 2 | tee $tmpfile
+                        ctest -R "$retry_unittests_regular" --timeout 120 --output-on-failure -j 2 | tee $tmpfile
                         collect_failed_tests
                         rm -f $tmp_dir/*
                         exec_times=$[$exec_times+1]
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index a0ae9bc29dabe..fdb7a3b2cb447 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from .spawn import spawn  # noqa: F401
-from .fleet.launch import launch  # noqa: F401
+from .launch.main import launch  # noqa: F401
 
 from .parallel import init_parallel_env  # noqa: F401
 from .parallel import get_rank  # noqa: F401
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py
index ae2d9163435b9..e303ce1216822 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/completion.py
@@ -21,11 +21,12 @@
 
 from .utils import print_program_with_dist_attr
 from .operators import find_best_compatible_distributed_operator_impl
-from .dist_context import get_default_distributed_context
+from .dist_context import get_default_distributed_context, _node_id
 from .dist_tensor import DistributedTensor
 from .dist_op import DistributedOperator
 from .dist_attribute import TensorDistributedAttribute
 from .dist_attribute import OperatorDistributedAttribute
+from .process_mesh import ProcessMesh
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 
 
@@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list):
     return compatible_result
 
 
+def merge_process_mesh_two(pm1, pm2):
+    process_set1 = set()
+    process_set2 = set()
+    if pm1 is None and pm2 is None:
+        return None
+    if pm1 is not None:
+        process_set1 = set(pm1.processes)
+    if pm2 is not None:
+        process_set2 = set(pm2.processes)
+    merged_process_set = process_set1.union(process_set2)
+    merged_process_mesh = ProcessMesh(list(merged_process_set))
+    return merged_process_mesh
+
+
 class Completer:
     def __init__(self, dist_context):
         assert dist_context is not None
@@ -119,7 +134,9 @@ def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True):
             return False
         tensor_desc = tensor_node.var()
         # Skip reader tensor
-        if tensor_desc.type() == core.VarDesc.VarType.READER:
+        if tensor_desc.type() == core.VarDesc.VarType.READER \
+            or tensor_desc.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+            or tensor_desc.type == core.VarDesc.VarType.STEP_SCOPES:
             return False
         tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
             tensor_node)
@@ -185,7 +202,7 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True):
         op_dist_attr = dist_op.dist_attr
         if fwd:
             for tensor_node in op_node.inputs:
-                if tensor_node.var() is not None:
+                if tensor_node.is_var() and tensor_node.var() is not None:
                     if tensor_node.var().type() == core.VarDesc.VarType.READER:
                         continue
                     tensor_desc = tensor_node.var()
@@ -208,19 +225,19 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True):
             # Find the most compatible implemenetations from the distributed operator
             op_dist_impl = find_best_compatible_distributed_operator_impl(
                 dist_op, fwd=True)
-            assert op_dist_impl is not None, "Cannot find the dist op implementation."
-            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
-            if dim_changed:
-                changed = True
-            if op_dist_impl.is_auto_compatible(dist_op):
-                if op_dist_impl.type == "elementwise":
-                    op_dist_attr.impl_type = "default"
-                else:
-                    op_dist_attr.impl_type = op_dist_impl.type
-                op_dist_attr.impl_idx = op_dist_impl.idx
+            if op_dist_impl is not None:
+                dim_changed = op_dist_impl.update_dims_mapping(dist_op)
+                if dim_changed:
+                    changed = True
+                if op_dist_impl.is_auto_compatible(dist_op):
+                    if op_dist_impl.type == "elementwise":
+                        op_dist_attr.impl_type = "default"
+                    else:
+                        op_dist_attr.impl_type = op_dist_impl.type
+                    op_dist_attr.impl_idx = op_dist_impl.idx
         else:
             for tensor_node in op_node.outputs:
-                if tensor_node.var() is not None:
+                if tensor_node.is_var() and tensor_node.var() is not None:
                     if tensor_node.var().type() == core.VarDesc.VarType.READER:
                         continue
                     tensor_desc = tensor_node.var()
@@ -243,61 +260,38 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True):
             # Find the most compatible implemenetations from the distributed operator
             op_dist_impl = find_best_compatible_distributed_operator_impl(
                 dist_op, fwd=False)
-            assert op_dist_impl is not None, "Cannot find the dist op implementation."
-            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
-            if dim_changed:
-                changed = True
-            if op_dist_impl.is_auto_compatible(dist_op):
-                if op_dist_impl.type == "elementwise":
-                    op_dist_attr.impl_type = "default"
-                else:
-                    op_dist_attr.impl_type = op_dist_impl.type
-                op_dist_attr.impl_idx = op_dist_impl.idx
+            if op_dist_impl is not None:
+                dim_changed = op_dist_impl.update_dims_mapping(dist_op)
+                if dim_changed:
+                    changed = True
+                if op_dist_impl.is_auto_compatible(dist_op):
+                    if op_dist_impl.type == "elementwise":
+                        op_dist_attr.impl_type = "default"
+                    else:
+                        op_dist_attr.impl_type = op_dist_impl.type
+                    op_dist_attr.impl_idx = op_dist_impl.idx
         return changed
 
-    def _update_process_mesh(self):
-        def _find_nearset_node(nodes, idx):
-            for node in reversed(nodes[:idx]):
-                node_dist_attr = self._dist_context.get_dist_attr_for_graph(
-                    node)
-                if node_dist_attr.process_mesh is not None:
-                    return node
-
-        total_reach_fix_point = False
-        while not total_reach_fix_point:
-            total_changed = False
-            for is_fwd in [True, False]:
-                all_nodes = self._dist_context.serial_ordered_nodes \
-                    if is_fwd else reversed(self._dist_context.serial_ordered_nodes)
-                reach_fix_point = False
-                while not reach_fix_point:
-                    changed = False
-                    for idx, node in enumerate(all_nodes):
-                        nearest_node = _find_nearset_node(
-                            self._dist_context.serial_ordered_nodes, idx)
-                        if nearest_node is None:
-                            continue
-                        nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph(
-                            nearest_node)
-                        nearest_process_mesh = nearest_node_dis_attr.process_mesh
-                        cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
-                            node)
-                        cur_process_mesh = cur_node_dist_attr.process_mesh
-                        compatible_process_mesh = compute_compatible_process_mesh(
-                            [cur_process_mesh, nearest_process_mesh])
-                        if compatible_process_mesh is not None \
-                            and cur_process_mesh != compatible_process_mesh:
-                            cur_node_dist_attr.process_mesh = compatible_process_mesh
-                            changed = True
-                    if changed:
-                        reach_fix_point = False
-                        total_changed = True
-                    else:
-                        reach_fix_point = True
-            if total_changed:
-                total_reach_fix_point = False
-            else:
-                total_reach_fix_point = True
+    def _update_dims_mapping_between_graphs(self):
+        changed = False
+        for parent_node, child_node in self._node_pairs_between_graphs:
+            parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                parent_node)
+            child_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                child_node)
+            parent_node_dims_mapping = parent_node_dist_attr.dims_mapping
+            child_node_dims_mapping = child_node_dist_attr.dims_mapping
+            compatible_dims_mapping = compute_compatible_dims_mapping(
+                [parent_node_dims_mapping, child_node_dims_mapping])
+            if (compatible_dims_mapping is not None) \
+                and (compatible_dims_mapping != parent_node_dims_mapping):
+                parent_node_dist_attr.dims_mapping = compatible_dims_mapping
+                changed = True
+            if (compatible_dims_mapping is not None) \
+                and (compatible_dims_mapping != child_node_dims_mapping):
+                parent_node_dist_attr.dims_mapping = compatible_dims_mapping
+                changed = True
+        return changed
 
     def _update_dims_mapping(self):
         # Complete dims_mapping for each node
@@ -318,11 +312,314 @@ def _update_dims_mapping(self):
                             node, fwd=is_fwd)
                         if op_changed:
                             changed = True
+                graph_changed = self._update_dims_mapping_between_graphs()
+                if graph_changed:
+                    changed = True
             if changed:
                 reach_fix_point = False
             else:
                 reach_fix_point = True
 
+    def _update_process_mesh_by_nearest(self, op_node, nearest_op_node):
+        op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
+        # Set the process mesh of the op node by its nearest op node
+        if not op_dist_attr.is_annotated("process_mesh"):
+            process_mesh = op_dist_attr.process_mesh
+            nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph(
+                nearest_op_node)
+            nearest_process_mesh = nearest_op_dis_attr.process_mesh
+            compatible_process_mesh = compute_compatible_process_mesh(
+                [process_mesh, nearest_process_mesh])
+            if compatible_process_mesh is not None \
+                and process_mesh != compatible_process_mesh:
+                op_dist_attr.process_mesh = compatible_process_mesh
+        # Skip the process_mesh setting of inputs and outputs of while_op
+        if op_dist_attr.op_type == "while":
+            return
+        # Set the process mesh of the op node's leaf-inputs
+        for tensor_node in op_node.inputs:
+            if tensor_node.is_var() and tensor_node.var() is not None:
+                tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+                    tensor_node)
+                if tensor_dist_attr.is_annotated("process_mesh"):
+                    continue
+                # Skip the non-leaf var node
+                if len(tensor_node.inputs) != 0:
+                    continue
+                compatible_process_mesh = compute_compatible_process_mesh(
+                    [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
+                if compatible_process_mesh is not None \
+                    and tensor_dist_attr.process_mesh != compatible_process_mesh:
+                    tensor_dist_attr.process_mesh = compatible_process_mesh
+        # Set the process mesh of the op node's outputs
+        for tensor_node in op_node.outputs:
+            if tensor_node.is_var() and tensor_node.var() is not None:
+                tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+                    tensor_node)
+                if tensor_dist_attr.is_annotated("process_mesh"):
+                    continue
+                compatible_process_mesh = compute_compatible_process_mesh(
+                    [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
+                if compatible_process_mesh is not None \
+                    and tensor_dist_attr.process_mesh != compatible_process_mesh:
+                    tensor_dist_attr.process_mesh = compatible_process_mesh
+
+    def _update_process_mesh_for_specials(self):
+        def _find_nearest_tensor_node_before(nodes, idx, var_name):
+            for node in reversed(nodes[:idx]):
+                if node.is_var() and node.var() is not None \
+                    and node.var().name() == var_name:
+                    return node
+
+        def _find_nearest_tensor_node_after(nodes, idx, var_name):
+            for node in nodes[idx + 1:]:
+                if node.is_var() and node.var() is not None \
+                    and node.var().name() == var_name:
+                    return node
+
+        def _find_nodes_related_to_cond(source_node):
+            related_nodes = []
+            visited = set()
+            frontier = list()
+            frontier.append(source_node)
+            # BFS
+            while len(frontier) != 0:
+                cur = frontier[0]
+                frontier = frontier[1:]
+                if _node_id(cur) in visited:
+                    continue
+                # TODO: need more restrictions
+                for node in cur.inputs:
+                    if node.is_var() and node.var() is not None:
+                        if node.var().type() != core.VarDesc.VarType.READER \
+                            and len(node.var().shape()) == 1:
+                            frontier.append(node)
+                            related_nodes.append(node)
+                    if node.is_op() and node.op() is not None:
+                        flag = True
+                        if node.op().type() == "create_py_reader" \
+                            or node.op().type() == "create_double_buffer_reader" \
+                            or node.op().type() == "read":
+                            flag = False
+                        for tensor_node in node.inputs:
+                            if tensor_node.is_var() and tensor_node.var(
+                            ) is not None:
+                                if tensor_node.var().type() == core.VarDesc.VarType.READER \
+                                    or len(tensor_node.var().shape()) != 1:
+                                    flag = False
+                                    break
+                        for tensor_node in node.outputs:
+                            if tensor_node.is_var() and tensor_node.var(
+                            ) is not None:
+                                if tensor_node.var().type() == core.VarDesc.VarType.READER \
+                                    or len(tensor_node.var().shape()) != 1:
+                                    flag = False
+                                    break
+                        if flag:
+                            frontier.append(node)
+                            related_nodes.append(node)
+                visited.add(_node_id(cur))
+            return related_nodes
+
+        # Amend the process meshes related to while_op
+        for while_op_node, while_op_node_idx in self._while_op_nodes.values():
+            sub_graph_id = while_op_node.op()._block_attr_id("sub_block")
+            sub_graph = self._dist_context._serial_graph.get_sub_graph(
+                sub_graph_id)
+            sub_graph_nodes = list(sub_graph.all_nodes())
+            while_dist_op = self._dist_context.get_dist_op_for_graph(
+                while_op_node)
+            while_op_dist_attr = while_dist_op.dist_attr
+
+            # Step 1: set the process mesh of while_op to the merged process mesh of its subblock
+            merged_process_mesh = while_op_dist_attr.process_mesh
+            for node in sub_graph_nodes:
+                if (node.is_var() and node.var() is not None) \
+                    or (node.is_op() and node.op() is not None):
+                    dist_attr = self._dist_context.get_dist_attr_for_graph(node)
+                    merged_process_mesh = merge_process_mesh_two(
+                        merged_process_mesh, dist_attr.process_mesh)
+            while_op_dist_attr.process_mesh = merged_process_mesh
+
+            # Step 2: set the related nodes of while_op to the process mesh of while_op
+            # Step 2.1: Find related nodes of cond var the graph of while_op
+            cond_tensor_related_nodes = []
+            cond_tensor_name = while_op_node.op().input("Condition")[0]
+            cond_tensor_node = None
+            for node in while_op_node.inputs:
+                if node.is_var() and node.var() is not None \
+                    and node.var().name() == cond_tensor_name:
+                    cond_tensor_node = node
+                    cond_tensor_related_nodes.append(cond_tensor_node)
+                    break
+
+            cond_tensor_related_nodes.extend(
+                _find_nodes_related_to_cond(cond_tensor_node))
+
+            # Step 2.2: Find related nodes of cond var in the subgraph of while_op
+            cond_tensor_node = None
+            for node in reversed(sub_graph_nodes):
+                if node.is_var() and node.var() is not None \
+                    and node.var().name() == cond_tensor_name \
+                        and len(node.outputs) == 0:
+                    cond_tensor_node = node
+                    break
+
+            cond_tensor_related_nodes.extend(
+                _find_nodes_related_to_cond(cond_tensor_node))
+            # Step 2.3: Add the StepScops output of while_op
+            stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0]
+            stepscopes_tensor_node = None
+            for output_node in while_op_node.outputs:
+                if output_node.is_var() and output_node.var() is not None \
+                    and output_node.var().name() == stepscopes_tensor_name:
+                    stepscopes_tensor_node = output_node
+            cond_tensor_related_nodes.append(stepscopes_tensor_node)
+            # Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op
+            for node in cond_tensor_related_nodes:
+                tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    node)
+                tensor_dist_attr.process_mesh = merged_process_mesh
+
+            # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes
+            while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs
+            for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items(
+            ):
+                nearest_tensor_node = _find_nearest_tensor_node_before(
+                    self._dist_context.serial_ordered_nodes, while_op_node_idx,
+                    tensor_name)
+                nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    nearest_tensor_node)
+                tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
+
+            # Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes
+            while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs
+            for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items(
+            ):
+                nearest_tensor_node = _find_nearest_tensor_node_before(
+                    self._dist_context.serial_ordered_nodes, while_op_node_idx,
+                    tensor_name)
+                if nearest_tensor_node is None:
+                    nearest_tensor_node = _find_nearest_tensor_node_after(
+                        self._dist_context.serial_ordered_nodes,
+                        while_op_node_idx, tensor_name)
+                nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    nearest_tensor_node)
+                tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh
+
+        # Amend the process meshes related to array
+        for array_node_list in self._array_nodes.values():
+            merged_process_mesh = None
+            for array_node in array_node_list:
+                dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    array_node)
+                merged_process_mesh = merge_process_mesh_two(
+                    merged_process_mesh, dist_attr.process_mesh)
+            for array_node in array_node_list:
+                dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    array_node)
+                dist_attr.process_mesh = merged_process_mesh
+
+    def _update_process_mesh(self):
+        ordered_op_nodes = self._dist_context._serial_ordered_op_nodes
+
+        # Step 1: Set the annotated process meshes from tensors to the first ops using them
+        ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes
+        for tensor_node in ordered_tensor_nodes:
+            tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+                tensor_node)
+            if not tensor_dist_attr.is_annotated("process_mesh"):
+                continue
+            first_op_node = None
+            for op_node in ordered_op_nodes:
+                # TODO: Need a better rule for the control flow ops.
+                # For now, do not set the process mesh of while_op from its inputs
+                if op_node.op().type() == "while":
+                    continue
+                for input_tensor_node in op_node.inputs:
+                    if _node_id(tensor_node) == _node_id(input_tensor_node):
+                        first_op_node = op_node
+                        break
+                if first_op_node is not None:
+                    break
+            if first_op_node is None:
+                continue
+            op_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                first_op_node)
+            if op_dist_attr is not None and not op_dist_attr.is_annotated(
+                    "process_mesh"):
+                compatible_process_mesh = compute_compatible_process_mesh(
+                    [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh])
+                if compatible_process_mesh is not None \
+                    and op_dist_attr.process_mesh != compatible_process_mesh:
+                    op_dist_attr.process_mesh = compatible_process_mesh
+
+        # Step 2: set the process meshes of ops with the nearest op before them
+        # Step 2.1: find the first op node which has the process mesh
+        idx_of_first_op_node_has_process_mesh = -1
+        for idx, op_node in enumerate(ordered_op_nodes):
+            op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
+            if op_dist_attr.process_mesh is not None \
+                and idx_of_first_op_node_has_process_mesh == -1:
+                idx_of_first_op_node_has_process_mesh = idx
+                # Reuse the following method to set the related tensors for same op node
+                self._update_process_mesh_by_nearest(op_node, op_node)
+        # Step 2.2: set the process meshes of ops by the nearest op node after the first op node
+        if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes):
+            return None
+        for idx, op_node in enumerate(ordered_op_nodes[
+                idx_of_first_op_node_has_process_mesh + 1:]):
+            original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1
+            nearest_op_node = ordered_op_nodes[original_idx - 1]
+            nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                nearest_op_node)
+            op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node)
+            assert nearest_op_dist_attr.process_mesh is not None
+            self._update_process_mesh_by_nearest(op_node, nearest_op_node)
+        # Step 2.3: set the process meshes of ops by the nearest op node before the first op node
+        nearest_op_node = ordered_op_nodes[
+            idx_of_first_op_node_has_process_mesh]
+        for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]:
+            self._update_process_mesh_by_nearest(op_node, nearest_op_node)
+
+        # Step 3: adjust the process meshes for special ops
+        self._update_process_mesh_for_specials()
+
+    def _prepare(self):
+        self._while_op_nodes = {}
+        self._array_nodes = {}
+        self._node_pairs_between_graphs = []
+        all_nodes = self._dist_context.serial_ordered_nodes
+        for idx, node in enumerate(all_nodes):
+            if node.is_op():
+                if node.op().type() == "while":
+                    self._while_op_nodes[_node_id(node)] = (node, idx)
+                if node.op().type() == "read_from_array":
+                    array_var_name = node.op().input("X")[0]
+                    if self._array_nodes.get(array_var_name, None) is None:
+                        self._array_nodes[array_var_name] = []
+                    self._array_nodes[array_var_name].append(node)
+                if node.op().type() == "write_to_array":
+                    array_var_name = node.op().output("Out")[0]
+                    if self._array_nodes.get(array_var_name, None) is None:
+                        self._array_nodes[array_var_name] = []
+                    self._array_nodes[array_var_name].append(node)
+                    self._array_nodes[array_var_name].append(node.outputs[0])
+            if node.is_var() and node.var() is not None:
+                if node.node.graph_id() != 0:
+                    for before_node in reversed(all_nodes[:idx]):
+                        if before_node.is_var() and before_node.var() is not None \
+                            and before_node.node.graph_id() == node.node.graph_id() - 1 \
+                                and before_node.var().name() == node.var().name():
+                            self._node_pairs_between_graphs.append(
+                                (before_node, node))
+                    for after_node in all_nodes[idx + 1:]:
+                        if after_node.is_var() and after_node.var() is not None \
+                            and after_node.node.graph_id() == node.node.graph_id() - 1 \
+                                and after_node.var().name() == node.var().name():
+                            self._node_pairs_between_graphs.append(
+                                (after_node, node))
+
     def complete_forward_annotation(self, serial_main_program):
         """ Complete annotation for the partial annotated serial_main_program.
         Arguments:
@@ -336,24 +633,24 @@ def complete_forward_annotation(self, serial_main_program):
 
         # Initialize distributed attributes for all var and op node in serial_main_program
         self._dist_context.init_dist_attr_for_program()
+        # print_program_with_dist_attr(serial_main_program, self._dist_context)
 
         # Initialize distributed attributes for all var and op node in graph
         self._dist_context.init_dist_attr_for_graph()
 
+        self._prepare()
+
         self._update_process_mesh()
 
-        # Complete dims_mapping for each node
         self._update_dims_mapping()
 
         # Copy the corresponding distributed attribute from graph to serial_main_program
         self._dist_context.copy_dist_attr_from_graph_to_program()
         self._dist_context.clear_dist_info_for_graph()
 
-        # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
         # Do the validation check and amend some completion
         self._dist_context.amend_dist_attr_for_program()
 
-        # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
         self._dist_context.validate_dist_attr_for_program()
 
         return serial_main_program
diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py
index b27cd7a37c956..8ec702ffcb0b6 100644
--- a/python/paddle/distributed/auto_parallel/dist_attribute.py
+++ b/python/paddle/distributed/auto_parallel/dist_attribute.py
@@ -175,6 +175,7 @@ def __str__(self):
 class OperatorDistributedAttribute:
     def __init__(self):
         self._process_mesh = None
+        self._op_type = None
         self._impl_type = None
         self._impl_idx = None
         self._inputs_dist_attrs = {}
@@ -194,11 +195,23 @@ def process_mesh(self, process_mesh):
             if isinstance(process_mesh, list):
                 process_mesh = ProcessMesh(process_mesh)
             self._process_mesh = copy.deepcopy(process_mesh)
+            # In while op, the proess mesh is not shared by all inputs and outputs 
+            if self._op_type == "while":
+                return None
             for dist_attr in self._inputs_dist_attrs.values():
                 dist_attr.process_mesh = process_mesh
             for dist_attr in self._outputs_dist_attrs.values():
                 dist_attr.process_mesh = process_mesh
 
+    @property
+    def op_type(self):
+        return self._op_type
+
+    @op_type.setter
+    def op_type(self, op_type):
+        if op_type is not None:
+            self._op_type = op_type
+
     @property
     def impl_type(self):
         return self._impl_type
@@ -326,6 +339,8 @@ def init(self, dist_attr):
                     assert False, "No setter for {} in args {}.".format(
                         key, dist_attr)
         # Make sure proscess_meshes in dist op be same
+        if self.op_type == "while":
+            return None
         process_meshes = []
         process_meshes.append(self.process_mesh)
         for tensor_dist_attr in self.inputs_dist_attrs.values():
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py
index 573f23fdca519..2807c46540ab1 100644
--- a/python/paddle/distributed/auto_parallel/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/dist_context.py
@@ -15,6 +15,7 @@
 import copy
 from collections import defaultdict
 from paddle.fluid import framework
+from paddle.fluid.framework import get_flags, set_flags
 from paddle.fluid import core
 from .dist_attribute import TensorDistributedAttribute
 from .dist_attribute import OperatorDistributedAttribute
@@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context):
     _g_default_distributed_context = dist_context
 
 
+def _node_id(node):
+    return (node.node.graph_id(), node.node.id())
+
+
 class DistributedContext:
     """
     DistributedContext is used to collect related distributed information for program and graph.
@@ -146,7 +151,7 @@ def get_dist_tensor_for_program(self, serial_tensor):
                 return None
 
     def get_dist_tensor_for_graph(self, serial_tensor_node):
-        serial_tensor_node_id = serial_tensor_node.id()
+        serial_tensor_node_id = _node_id(serial_tensor_node)
         return self._dist_tensors_for_graph.get(serial_tensor_node_id, None)
 
     def get_dist_op_for_program(self, serial_op):
@@ -168,7 +173,7 @@ def del_dist_op_for_program(self, serial_tensor):
             del self._dist_ops_for_program[serial_tensor_id]
 
     def get_dist_op_for_graph(self, serial_op_node):
-        serial_op_node_id = serial_op_node.id()
+        serial_op_node_id = _node_id(serial_op_node)
         return self._dist_ops_for_graph.get(serial_op_node_id, None)
 
     def get_tensor_dist_attr_for_program(self, serial_tensor):
@@ -197,7 +202,7 @@ def set_tensor_dist_attr_for_program(self, serial_tensor, dist_attr):
         self.add_dist_tensor_for_program(dist_tensor)
 
     def get_tensor_dist_attr_for_graph(self, serial_tensor_node):
-        serial_tensor_node_id = serial_tensor_node.id()
+        serial_tensor_node_id = _node_id(serial_tensor_node)
         dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id,
                                                        None)
         if dist_tensor:
@@ -242,7 +247,7 @@ def set_op_dist_attr_for_program(self, serial_op, dist_attr):
         self.add_dist_op_for_program(dist_op)
 
     def get_op_dist_attr_for_graph(self, serial_op_node):
-        serial_op_node_id = serial_op_node.id()
+        serial_op_node_id = _node_id(serial_op_node)
         dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
         if dist_op:
             return dist_op.dist_attr
@@ -262,7 +267,7 @@ def get_op_dist_attr_for_graph(self, serial_op_node):
 
     def get_dist_attr_for_graph(self, serial_node):
         if serial_node.is_var() and serial_node.var() is not None:
-            serial_tensor_node_id = serial_node.id()
+            serial_tensor_node_id = _node_id(serial_node)
             dist_tensor = self._dist_tensors_for_graph.get(
                 serial_tensor_node_id, None)
             if dist_tensor:
@@ -270,7 +275,7 @@ def get_dist_attr_for_graph(self, serial_node):
             else:
                 return None
         if serial_node.is_op() and serial_node.op() is not None:
-            serial_op_node_id = serial_node.id()
+            serial_op_node_id = _node_id(serial_node)
             dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
             if dist_op:
                 return dist_op.dist_attr
@@ -311,40 +316,69 @@ def init_dist_attr_for_program(self):
     def order_nodes_by_program_order(self):
         def _contains(nodes, target_node):
             for node in nodes:
-                if node.id() == target_node.id():
+                if _node_id(node) == _node_id(target_node):
                     return True
             return False
 
-        ordered_tensor_nodes = []
-        ordered_op_nodes = []
-        all_nodes = self._serial_graph.all_nodes()
+        serial_ordered_tensor_nodes = []
+        serial_ordered_op_nodes = []
+        all_nodes = []
+        # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
+        for idx, graph in enumerate(self._serial_graph.all_sub_graphs()):
+            for node in graph.all_nodes():
+                all_nodes.append(node)
         for node in all_nodes:
             if node.is_var() and node.var() is not None:
-                ordered_tensor_nodes.append(node)
+                serial_ordered_tensor_nodes.append(node)
             if node.is_op() and node.op() is not None:
-                ordered_op_nodes.append(node)
-        ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
-        ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id())
-        for op_node in ordered_op_nodes:
+                serial_ordered_op_nodes.append(node)
+        serial_ordered_tensor_nodes.sort(
+            key=lambda node: node.node.original_desc_id())
+        serial_ordered_op_nodes.sort(
+            key=lambda node: node.node.original_desc_id())
+        num_nodes_before = len(serial_ordered_tensor_nodes) + len(
+            serial_ordered_op_nodes)
+
+        new_serial_ordered_tensor_nodes = []
+        new_serial_ordered_op_nodes = []
+        for op_node in serial_ordered_op_nodes:
             tensor_nodes = []
             for tensor_node in op_node.inputs:
                 if tensor_node.is_var() \
                     and tensor_node.var() is not None \
                     and not _contains(self._serial_ordered_nodes, tensor_node):
                     tensor_nodes.append(tensor_node)
+                    new_serial_ordered_tensor_nodes.append(tensor_node)
             tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
             self._serial_ordered_nodes.extend(tensor_nodes)
             self._serial_ordered_nodes.append(op_node)
+            new_serial_ordered_op_nodes.append(op_node)
             tensor_nodes = []
             for tensor_node in op_node.outputs:
                 if tensor_node.is_var() \
                     and tensor_node.var() is not None \
                     and not _contains(self._serial_ordered_nodes, tensor_node):
                     tensor_nodes.append(tensor_node)
+                    new_serial_ordered_tensor_nodes.append(tensor_node)
+            tensor_nodes.sort(key=lambda node: node.node.original_desc_id())
             self._serial_ordered_nodes.extend(tensor_nodes)
-        num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes)
-        assert len(self._serial_ordered_nodes) == num_nodes_before, \
-            "The number of nodes before ordering is not the same after ordering."
+        new_serial_ordered_tensor_nodes.sort(
+            key=lambda node: node.node.original_desc_id())
+        new_serial_ordered_op_nodes.sort(
+            key=lambda node: node.node.original_desc_id())
+        self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes
+        self._serial_ordered_op_nodes = new_serial_ordered_op_nodes
+        assert len(self._serial_ordered_nodes) == len(
+            self._serial_ordered_tensor_nodes) + len(
+                self._serial_ordered_op_nodes)
+        self._serial_orphan_tensor_nodes = []
+        for tensor_node in serial_ordered_tensor_nodes:
+            if not _contains(self._serial_ordered_tensor_nodes, tensor_node):
+                self._serial_orphan_tensor_nodes.append(tensor_node)
+        if len(self._serial_ordered_nodes) != num_nodes_before:
+            print(
+                "WARNING: there are some orphan tensors or ops which are not used in the execution."
+            )
 
     def init_dist_attr_for_graph(self):
         assert self._is_initialized_for_program, \
@@ -352,9 +386,9 @@ def init_dist_attr_for_graph(self):
         if self._is_initialized_for_graph:
             return
         # Convert program to graph
+        set_flags({"FLAGS_convert_all_blocks": True})
         self._serial_graph = framework.IrGraph(
             core.Graph(self._serial_program.desc))
-        all_nodes = self._serial_graph.all_nodes()
         self.order_nodes_by_program_order()
         for node in self.serial_ordered_nodes:
             if node.is_var() and node.var() is not None:
@@ -365,10 +399,11 @@ def init_dist_attr_for_graph(self):
                     if tensor_id == cur_tensor_id \
                         or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id():
                         dist_tensor = cur_dist_tensor
-                        self._node_id_to_tensor_id[node.id()] = cur_tensor_id
+                        self._node_id_to_tensor_id[_node_id(
+                            node)] = cur_tensor_id
                 assert dist_tensor is not None, \
                     "Tensor must have a distributed tensor after the initialization for program."
-                serial_tensor_node_id = node.id()
+                serial_tensor_node_id = _node_id(node)
                 new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor,
                                                     dist_tensor.dist_attr)
                 self._dist_tensors_for_graph[
@@ -381,10 +416,10 @@ def init_dist_attr_for_graph(self):
                     if op_id == cur_op_id \
                         or op_id == cur_dist_op.serial_op.desc.original_id():
                         dist_op = cur_dist_op
-                        self._node_id_to_op_id[node.id()] = cur_op_id
+                        self._node_id_to_op_id[_node_id(node)] = cur_op_id
                 assert dist_op is not None, \
                     "Operator must have a distributed operator after the initialization for program."
-                serial_op_node_id = node.id()
+                serial_op_node_id = _node_id(node)
                 new_dist_op = DistributedOperator(dist_op.serial_op,
                                                   dist_op.dist_attr)
                 self._dist_ops_for_graph[serial_op_node_id] = new_dist_op
@@ -402,10 +437,11 @@ def copy_dist_attr_from_graph_to_program(self):
         assert self._is_initialized_for_program and self._is_initialized_for_graph, \
             "Both program and graph must be initialized."
         updated_tensors = {}
-        all_nodes = self._serial_graph.all_nodes()
+        # all_nodes = self._serial_graph.all_nodes()
+        all_nodes = self._serial_ordered_nodes
         for node in all_nodes:
             if node.is_var() and node.var() is not None:
-                tensor_id = self._node_id_to_tensor_id[node.id()]
+                tensor_id = self._node_id_to_tensor_id[_node_id(node)]
                 updated = updated_tensors.get(tensor_id, False)
                 # If a var has multiples var nodes in graph, only use the first one for now
                 if not updated:
@@ -416,16 +452,31 @@ def copy_dist_attr_from_graph_to_program(self):
                     dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph
                     updated_tensors[tensor_id] = True
             if node.is_op() and node.op() is not None:
-                op_id = self._node_id_to_op_id[node.id()]
+                op_id = self._node_id_to_op_id[_node_id(node)]
                 op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node)
                 dist_op_for_program = self._dist_ops_for_program[op_id]
                 dist_op_for_program.dist_attr = op_dist_attr_for_graph
+        # TODO: the completion algorithm will skip orphan tensors, 
+        # here we just set there process_mesh to the first one.
+        for orphan_node in self._serial_orphan_tensor_nodes:
+            serial_tensor_id = orphan_node.var().id()
+            dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id,
+                                                             None)
+            if dist_tensor:
+                dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
+            else:
+                serial_tensor_id = orphan_node.var().original_id()
+                dist_tensor = self._dist_tensors_for_program.get(
+                    serial_tensor_id, None)
+                dist_tensor.dist_attr.process_mesh = self._process_meshes[0]
 
     def amend_dist_attr_for_program(self):
         for dist_tensor in self._dist_tensors_for_program.values():
             serial_tensor = dist_tensor.serial_tensor
             dist_attr = dist_tensor.dist_attr
-            if serial_tensor.type == core.VarDesc.VarType.READER:
+            if serial_tensor.type == core.VarDesc.VarType.READER \
+                or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+                or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
                 tensor_shape = []
             else:
                 tensor_shape = serial_tensor.shape
@@ -446,6 +497,7 @@ def amend_dist_attr_for_program(self):
                     tensor_shape = []
                 else:
                     if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \
+                        or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
                         or dist_op.serial_op.type == "create_py_reader":
                         tensor_shape = []
                     else:
@@ -459,8 +511,9 @@ def amend_dist_attr_for_program(self):
                         and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
                         dims_mapping[i] = -1
             for arg_name in serial_op.output_arg_names:
-                if dist_op.get_serial_output(
-                        arg_name).type == core.VarDesc.VarType.READER:
+                if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \
+                    or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+                    or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES:
                     tensor_shape = []
                 else:
                     tensor_shape = dist_op.get_serial_output(arg_name).shape
@@ -498,7 +551,8 @@ def __deepcopy__(self, memo):
         for k, v in self.__dict__.items():
             if k == "_serial_program" or k == "_serial_graph" \
                 or k == "_dist_main_programs" or k == "_dist_startup_programs" \
-                or k == "_serial_ordered_nodes":
+                or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \
+                or k == "_serial_ordered_op_nodes":
                 setattr(result, k, v)
             else:
                 setattr(result, k, copy.deepcopy(v, memo))
diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py
index 67de298564afc..a2c2748a8cea3 100644
--- a/python/paddle/distributed/auto_parallel/dist_op.py
+++ b/python/paddle/distributed/auto_parallel/dist_op.py
@@ -76,7 +76,8 @@ def _init_default_dist_attr(self):
             if tensor is None:
                 tensor_shape = []
             else:
-                if tensor.type == core.VarDesc.VarType.READER:
+                if tensor.type == core.VarDesc.VarType.READER \
+                    or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
                     tensor_shape = []
                 else:
                     tensor_shape = tensor.shape
@@ -86,7 +87,9 @@ def _init_default_dist_attr(self):
                                                        tensor_dims_mapping)
         for tensor_name in self._serial_op.output_arg_names:
             tensor = self._serial_op.block._var_recursive(tensor_name)
-            if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES:
+            if tensor.type == core.VarDesc.VarType.READER \
+                or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+                or tensor.type == core.VarDesc.VarType.STEP_SCOPES:
                 tensor_shape = []
             else:
                 tensor_shape = tensor.shape
@@ -95,6 +98,8 @@ def _init_default_dist_attr(self):
                 tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
                 self._dist_attr.set_output_dims_mapping(tensor_name,
                                                         tensor_dims_mapping)
+        if self._dist_attr.op_type is None:
+            self._dist_attr.op_type = self.serial_op.type
         if self._dist_attr.impl_type is None:
             self._dist_attr.impl_type = "default"
         if self._dist_attr.impl_idx is None:
@@ -134,12 +139,16 @@ def _filter_dist_attr(self, dist_attr):
         return new_dist_attr
 
     def validate_dist_attr(self):
-        if "read" in self.serial_op.type:
+        if "read" in self.serial_op.type or "while" == self.serial_op.type:
             return True
         for name in self.serial_op.input_arg_names:
             input_dist_attr = self.dist_attr.get_input_dist_attr(name)
             dims_mapping = input_dist_attr.dims_mapping
-            shape = self.get_serial_input(name).shape
+            if self.get_serial_input(
+                    name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+                shape = []
+            else:
+                shape = self.get_serial_input(name).shape
             if len(shape) != len(dims_mapping):
                 return False
             for i in range(len(dims_mapping)):
@@ -155,7 +164,11 @@ def validate_dist_attr(self):
         for name in self.serial_op.output_arg_names:
             output_dist_attr = self.dist_attr.get_output_dist_attr(name)
             dims_mapping = output_dist_attr.dims_mapping
-            shape = self.get_serial_output(name).shape
+            if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\
+                or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES:
+                shape = []
+            else:
+                shape = self.get_serial_output(name).shape
             if len(shape) != len(dims_mapping):
                 return False
             for i in range(len(dims_mapping)):
@@ -241,14 +254,14 @@ def __init__(self, serial_module, dist_attr=None):
 
     def __call__(self, *args, **kwargs):
         from .dist_context import get_default_distributed_context
-        main_prog = paddle.fluid.default_main_program()
-        main_block = main_prog.global_block()
-        op_size = len(main_block.ops)
+        default_prog = paddle.fluid.default_main_program()
+        cur_block = default_prog.current_block()
+        op_size = len(cur_block.ops)
         output = self._serial_module(*args, **kwargs)
-        new_op_size = len(main_block.ops)
+        new_op_size = len(cur_block.ops)
         default_dist_ctx = get_default_distributed_context()
         for idx in range(op_size, new_op_size):
-            op = main_block.ops[idx]
+            op = cur_block.ops[idx]
             dist_op = DistributedOperator(op, self._dist_attr)
             dist_op.dist_attr.mark_annotated_as(self._dist_attr)
             default_dist_ctx.add_dist_op_for_program(dist_op)
diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py
index 5e3c852699ab6..a42ce863492b3 100644
--- a/python/paddle/distributed/auto_parallel/dist_tensor.py
+++ b/python/paddle/distributed/auto_parallel/dist_tensor.py
@@ -184,7 +184,9 @@ def dist_attr(self, dist_attr):
 
     def _init_default_dist_attr(self):
         if self._dist_attr.dims_mapping is None:
-            if self.serial_tensor.type == core.VarDesc.VarType.READER:
+            if self.serial_tensor.type == core.VarDesc.VarType.READER \
+                or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+                or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
                 tensor_shape = []
             else:
                 tensor_shape = self._serial_tensor.shape
@@ -192,7 +194,9 @@ def _init_default_dist_attr(self):
             self._dist_attr.dims_mapping = tensor_dims_mapping
 
     def validate_dist_attr(self):
-        if self.serial_tensor.type == core.VarDesc.VarType.READER:
+        if self.serial_tensor.type == core.VarDesc.VarType.READER \
+            or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \
+            or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES:
             return True
         tensor_shape = self.serial_tensor.shape
         if len(tensor_shape) != len(self.dist_attr.dims_mapping):
diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py
index 56beb8957415d..6bd1c5527a99e 100644
--- a/python/paddle/distributed/auto_parallel/engine.py
+++ b/python/paddle/distributed/auto_parallel/engine.py
@@ -259,7 +259,7 @@ def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=1000):
                     "train_" + name: val
                     for name, val in logs.items()
                 }
-                self._logger.info(logs)
+                self._logger.info(train_logs)
 
     def _train_step(self, data):
         logs = {}
diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py
index 4b079e7b6b575..47f76353e4655 100644
--- a/python/paddle/distributed/auto_parallel/operators/common.py
+++ b/python/paddle/distributed/auto_parallel/operators/common.py
@@ -17,7 +17,9 @@
 
 _g_distributed_operator_impl_containers = {}
 
-_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"]
+_g_elementwise_ops = [
+    "elementwise_add", "gelu", "dropout", "cast", "gather", "concat"
+]
 BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'}
 
 
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py
index 4e977007261a7..de6d018d60521 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_default.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py
@@ -55,9 +55,14 @@ def is_input_compatible(self, dist_op):
         op_dist_attr = dist_op.dist_attr
         for arg_name in op_desc.input_arg_names():
             serial_tensor = dist_op.get_serial_input(arg_name)
-            if serial_tensor.is_parameter:
-                continue
             dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
+            if serial_tensor.is_parameter:
+                for mapping in dims_mapping:
+                    if mapping != -1:
+                        return False
+                # continue
+                # if len(dims_mapping) < 1:
+                #     continue
             if len(dims_mapping) > 1:
                 for mapping in dims_mapping[1:]:
                     if mapping != -1:
@@ -73,9 +78,14 @@ def is_output_compatible(self, dist_op):
             xshape_arg_names = op_desc.output("XShape")
         for arg_name in op_desc.output_arg_names():
             serial_tensor = dist_op.get_serial_output(arg_name)
-            if serial_tensor.is_parameter:
-                continue
             dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
+            if serial_tensor.is_parameter:
+                for mapping in dims_mapping:
+                    if mapping != -1:
+                        return False
+                # continue
+                # if len(dims_mapping) < 1:
+                #     continue
             if arg_name not in xshape_arg_names:
                 if len(dims_mapping) > 1:
                     for mapping in dims_mapping[1:]:
@@ -104,7 +114,8 @@ def is_auto_compatible(self, dist_op):
                 for mapping in dims_mapping[1:]:
                     if mapping != -1:
                         return False
-            batch_dim_mappings.append(dims_mapping[0])
+            if len(dims_mapping) >= 1:
+                batch_dim_mappings.append(dims_mapping[0])
 
         # Check output compatibility
         output_names = op_desc.output_names()
@@ -121,7 +132,8 @@ def is_auto_compatible(self, dist_op):
                     for mapping in dims_mapping[1:]:
                         if mapping != -1:
                             return False
-                batch_dim_mappings.append(dims_mapping[0])
+                if len(dims_mapping) >= 1:
+                    batch_dim_mappings.append(dims_mapping[0])
             else:
                 if dims_mapping[0] != -1:
                     return False
@@ -129,7 +141,8 @@ def is_auto_compatible(self, dist_op):
                     for mapping in dims_mapping[2:]:
                         if mapping != -1:
                             return False
-                batch_dim_mappings.append(dims_mapping[1])
+                if len(dims_mapping) >= 2:
+                    batch_dim_mappings.append(dims_mapping[1])
 
         # Check batch dim mapping compatibility
         if not all(batch_dim_mappings[0] == dim_mapping
@@ -143,7 +156,9 @@ def update_dims_mapping(self, dist_op):
         op_desc = dist_op.serial_op.desc
         op_dist_attr = dist_op.dist_attr
         # The following statement will be replaced by a more elegent way
-        if op_desc.type() == "shape" or op_desc.type() == "slice":
+        if op_desc.type() == "shape" \
+            or op_desc.type() == "slice" \
+                or op_desc.type() == "while":
             return False
         output_names = op_desc.output_names()
         xshape_arg_names = []
@@ -155,17 +170,22 @@ def update_dims_mapping(self, dist_op):
             if serial_tensor.is_parameter:
                 continue
             dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
-            batch_dim_mappings.append(dims_mapping[0])
+            if len(dims_mapping) >= 1:
+                batch_dim_mappings.append(dims_mapping[0])
         for arg_name in op_desc.output_arg_names():
             serial_tensor = dist_op.get_serial_output(arg_name)
             if serial_tensor.is_parameter:
                 continue
             dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
             if arg_name not in xshape_arg_names:
-                batch_dim_mappings.append(dims_mapping[0])
+                if len(dims_mapping) >= 1:
+                    batch_dim_mappings.append(dims_mapping[0])
             else:
                 batch_dim_mappings.append(dims_mapping[1])
 
+        if not batch_dim_mappings:
+            return changed
+
         compatible_dim_mapping = compute_compatible_dim_mapping(
             batch_dim_mappings)
         assert compatible_dim_mapping is not None, "There is no compatible dim mapping."
@@ -174,7 +194,8 @@ def update_dims_mapping(self, dist_op):
             if serial_tensor.is_parameter:
                 continue
             dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
-            if compatible_dim_mapping != dims_mapping[0]:
+            if len(dims_mapping
+                   ) >= 1 and compatible_dim_mapping != dims_mapping[0]:
                 dims_mapping[0] = compatible_dim_mapping
                 changed = True
         for arg_name in op_desc.output_arg_names():
@@ -183,11 +204,13 @@ def update_dims_mapping(self, dist_op):
                 continue
             dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
             if arg_name not in xshape_arg_names:
-                if compatible_dim_mapping != dims_mapping[0]:
+                if len(dims_mapping
+                       ) >= 1 and compatible_dim_mapping != dims_mapping[0]:
                     dims_mapping[0] = compatible_dim_mapping
                     changed = True
             else:
-                if compatible_dim_mapping != dims_mapping[1]:
+                if len(dims_mapping
+                       ) >= 2 and compatible_dim_mapping != dims_mapping[1]:
                     dims_mapping[1] = compatible_dim_mapping
                     changed = True
 
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
index 058ae1d0a9fd5..684db52a28d83 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
@@ -1432,7 +1432,6 @@ def is_input_compatible(self, dist_op):
         if is_valid_list_index(y_dims_mapping,
                                -2) and is_dim_shard(y_dims_mapping[-2]):
             return False
-
         return True
 
     def is_output_compatible(self, dist_op):
@@ -1483,3 +1482,512 @@ def backward(ctx, *args, **kwargs):
                                    DistributedMatmulV2Impl1("row_parallel"))
 register_distributed_operator_impl(
     "matmul_v2", DistributedMatmulV2Impl2("replicate_parallel"))
+
+
+class DistributedMul(DistributedOperatorImplContainer):
+    def __init__(self, op_type):
+        super(DistributedMul, self).__init__(op_type)
+
+
+register_distributed_operator_impl_container(DistributedMul("mul"))
+
+
+# ColumnParallel
+class DistributedMulImpl0(DistributedOperatorImpl):
+    def __init__(self, name):
+        super(DistributedMulImpl0, self).__init__(name)
+        self._forward_implemented = True
+        self._backward_implemented = True
+
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        x_name = op_desc.input('X')[0]
+        y_name = op_desc.input('Y')[0]
+        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+        y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+        if is_dim_shard(x_dims_mapping[-1]):
+            return False
+        if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
+                -1]):
+            return False
+        for mapping in x_dims_mapping[1:-1]:
+            if is_dim_shard(mapping):
+                return False
+        return True
+
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        out_name = op_desc.output('Out')[0]
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        if is_dim_replicate(out_dims_mapping[-1]):
+            return False
+        for mapping in out_dims_mapping[1:-1]:
+            if is_dim_shard(mapping):
+                return False
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)):
+            return False
+
+        if not _is_auto_compatible_for_matmul(dist_op):
+            return False
+
+        return True
+
+    def update_dims_mapping(self, dist_op):
+        changed = False
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
+        if dim_changed:
+            changed = True
+        return changed
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        """
+        kwargs: inputname_mapping & outputname_mapping
+        """
+
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.work_block
+        startup_block = dist_op_context.startup_block
+        src_op = dist_op_context.cur_src_op
+        rank_id = dist_op_context.rank_id
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
+        assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
+            str(src_op))
+
+        # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
+                                              rank_id)
+
+        # check validation of inputs / outputs
+        for input_name in src_op.desc.input_names():
+            assert input_name in kwargs, "input [{}] is not given".format(
+                input_name)
+            assert len(kwargs[input_name]) == len(
+                src_op.desc.input(input_name)
+            ), "number of tensor for input [{}] is not match".format(input_name)
+        for output_name in src_op.desc.output_names():
+            assert output_name in kwargs, "input [{}] is not given".format(
+                output_name)
+            assert len(kwargs[output_name]) == len(
+                src_op.desc.output(output_name)
+            ), "number of tensor for input [{}] is not match".format(
+                output_name)
+
+        X_var = main_block.var(kwargs['X'][0])
+        Weight_var = main_block._var_recursive(kwargs['Y'][0])
+        Out_var = main_block.var(kwargs['Out'][0])
+
+        # TODO infer logic comm presentation
+        matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping(
+            Weight_var.name)[-1]
+        assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
+            matmul_col_dim_mapping)
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
+
+        parallel_axis = matmul_col_dim_mapping
+        group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
+                                      parallel_axis, rank_id)
+        group = new_process_group(group_ranks)
+
+        # infer new var shape with op dist attr
+        x_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(X_var)
+        assert x_tensor_dist_attr is not None
+        identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name)
+        assert identity_var_dist_attr is not None
+        ref_shape_x = infer_shape(main_block, X_var, x_tensor_dist_attr,
+                                  identity_var_dist_attr)
+        # infer out var shape with op dist attr
+        out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
+        assert out_tensor_dist_attr is not None
+        out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
+        assert out_var_dist_attr is not None
+        ref_shape_out = infer_shape(main_block, Out_var, out_tensor_dist_attr,
+                                    out_var_dist_attr)
+
+        intermediate_var_0 = main_block.create_var(
+            name=unique_name.generate_with_ignorable_key(".".join(
+                ["c_identity", 'tmp'])),
+            dtype=X_var.dtype,
+            shape=X_var.shape,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=X_var.stop_gradient)
+        # set intermediate_var_0's dist_attr with X_var's dist_attr
+        ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
+                                             identity_var_dist_attr)
+
+        check_variable_and_dtype(
+            X_var, 'tensor',
+            ['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity')
+        c_identity_op = main_block.append_op(
+            type='c_identity',
+            inputs={'X': [X_var]},
+            outputs={'Out': intermediate_var_0},
+            attrs={
+                'ring_id': group.id,
+                'use_calc_stream': True,
+                'use_model_parallel': True,
+            })
+        if intermediate_var_0.shape != ref_shape_x:
+            intermediate_var_0.desc.set_shape(ref_shape_x)
+
+        check_variable_and_dtype(intermediate_var_0, 'x',
+                                 ['float16', 'float32', 'float64'], 'linear')
+        check_dtype(intermediate_var_0.dtype, 'dtype',
+                    ['float16', 'float32', 'float64'], 'linear')
+        # attrs = {'trans_x': False, 'trans_y': False}
+        attrs = {
+            "x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
+            "y_num_col_dims": src_op.desc.attr("y_num_col_dims")
+        }
+        inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
+        mul_op = main_block.append_op(
+            type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
+        if Out_var.shape != ref_shape_out:
+            Out_var.desc.set_shape(ref_shape_out)
+
+        # set dist op's dist_attr with serial op's dist_attr
+        # c_identity
+        identity_op_dist_attr = OperatorDistributedAttribute()
+        identity_op_dist_attr.process_mesh = op_dist_attr.process_mesh
+        identity_op_dist_attr.impl_type = op_dist_attr.impl_type
+        identity_op_dist_attr.impl_idx = op_dist_attr.impl_idx
+        # input
+        input_varname = c_identity_op.desc.input_arg_names()[0]
+        input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
+        assert input_dist_attr is not None, "dist_attr is {}".format(
+            op_dist_attr)
+        identity_op_dist_attr.set_input_dist_attr(input_varname,
+                                                  input_dist_attr)
+        # output
+        output_varname = c_identity_op.desc.output_arg_names()[0]
+        identity_op_dist_attr.set_output_dist_attr(output_varname,
+                                                   input_dist_attr)
+        ctx.set_op_dist_attr_for_program(c_identity_op, identity_op_dist_attr)
+
+        # matmulv2
+        matmulv2_op_dist_attr = OperatorDistributedAttribute()
+        matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
+        matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
+        matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
+        for input_varname in mul_op.desc.input_arg_names():
+            if input_varname in src_op.desc.input_arg_names():
+                input_dist_attr = op_dist_attr.get_input_dist_attr(
+                    input_varname)
+                assert input_dist_attr is not None, "dist_attr is {}".format(
+                    op_dist_attr)
+                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
+                                                          input_dist_attr)
+            else:
+                input_var = main_block.var(input_varname)
+                tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(
+                    input_var)
+                matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
+                                                          tensor_dist_attr)
+        for output_varname in mul_op.desc.output_arg_names():
+            output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
+            assert output_dist_attr is not None, "dist_attr is {}".format(
+                op_dist_attr)
+            matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
+                                                       output_dist_attr)
+        ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
+
+        # init param sync
+        if Weight_var.is_parameter and not op_dist_attr.is_recompute:
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
+                             rank_id)
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        _right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
+
+
+# RowParallel
+class DistributedMulImpl1(DistributedOperatorImpl):
+    def __init__(self, name):
+        super(DistributedMulImpl1, self).__init__(name)
+        self._forward_implemented = True
+        self._backward_implemented = True
+
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        x_name = op_desc.input('X')[0]
+        y_name = op_desc.input('Y')[0]
+        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+        y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+        if is_dim_replicate(x_dims_mapping[-1]):
+            return False
+        if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
+                -1]):
+            return False
+        # Other dimensions must be replicate except the batch dimension
+        for mapping in x_dims_mapping[1:-1]:
+            if is_dim_shard(mapping):
+                return False
+        return True
+
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        out_name = op_desc.output('Out')[0]
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+        if is_dim_shard(out_dims_mapping[-1]):
+            return False
+        # Other dimensions must be replicate except the batch dimension
+        for mapping in out_dims_mapping[1:-1]:
+            if is_dim_shard(mapping):
+                return False
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)):
+            return False
+
+        if not _is_auto_compatible_for_matmul(dist_op):
+            return False
+
+        return True
+
+    def update_dims_mapping(self, dist_op):
+        changed = False
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
+        if dim_changed:
+            changed = True
+        return changed
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        """
+        kwargs: inputname_mapping & outputname_mapping
+        """
+
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.work_block
+        startup_block = dist_op_context.startup_block
+        src_op = dist_op_context.cur_src_op
+        rank_id = dist_op_context.rank_id
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
+        assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
+            str(src_op))
+
+        # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
+                                              rank_id)
+
+        # check validation of inputs / outputs
+        for input_name in src_op.desc.input_names():
+            assert input_name in kwargs, "input [{}] is not given".format(
+                input_name)
+            assert len(kwargs[input_name]) == len(
+                src_op.desc.input(input_name)
+            ), "number of tensor for input [{}] is not match".format(input_name)
+        for output_name in src_op.desc.output_names():
+            assert output_name in kwargs, "input [{}] is not given".format(
+                output_name)
+            assert len(kwargs[output_name]) == len(
+                src_op.desc.output(output_name)
+            ), "number of tensor for input [{}] is not match".format(
+                output_name)
+
+        X_var = main_block.var(kwargs['X'][0])
+        Weight_var = main_block._var_recursive(kwargs['Y'][0])
+        Out_var = main_block.var(kwargs['Out'][0])
+
+        # TODO infer logic comm presentation
+        matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping(
+            Weight_var.name)[-2]
+        assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
+            matmul_row_dim_mapping)
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
+
+        parallel_axis = matmul_row_dim_mapping
+        group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
+                                      parallel_axis, rank_id)
+        group = new_process_group(group_ranks)
+
+        check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'],
+                                 'linear')
+        check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'],
+                    'linear')
+        # attrs = {'trans_x': False, 'trans_y': False}
+        attrs = {
+            "x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
+            "y_num_col_dims": src_op.desc.attr("y_num_col_dims")
+        }
+        inputs = {'X': X_var, 'Y': Weight_var}
+
+        # infer out var shape with op dist attr
+        out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
+        assert out_tensor_dist_attr is not None
+        out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
+        assert out_var_dist_attr is not None
+        ref_shape = infer_shape(main_block, Out_var, out_tensor_dist_attr,
+                                out_var_dist_attr)
+
+        intermediate_var_0 = main_block.create_var(
+            shape=Out_var.shape,
+            dtype=Out_var.dtype,
+            type=Out_var.type,
+            lod_level=Out_var.lod_level,
+            persistable=False,
+            is_data=False,
+            need_check_feed=Out_var.desc.need_check_feed())
+        # set intermediate_var_0's dist_attr with Out_var's dist_attr
+        ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
+                                             out_var_dist_attr)
+
+        mul_op = main_block.append_op(
+            type='mul',
+            inputs=inputs,
+            outputs={'Out': intermediate_var_0},
+            attrs=attrs)
+        if intermediate_var_0.shape != ref_shape:
+            intermediate_var_0.desc.set_shape(ref_shape)
+
+        c_allreduce_sum_op = main_block.append_op(
+            type='c_allreduce_sum',
+            inputs={'X': intermediate_var_0},
+            outputs={'Out': Out_var},
+            attrs={
+                'ring_id': group.id,
+                'use_calc_stream': True,
+                'use_model_parallel': True
+            })
+        if Out_var.shape != ref_shape:
+            Out_var.desc.set_shape(ref_shape)
+
+        # set dist op's dist_attr with serial op's dist_attr
+        # matmulv2
+        matmulv2_op_dist_attr = OperatorDistributedAttribute()
+        matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
+        matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
+        matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
+        for input_varname in mul_op.desc.input_arg_names():
+            input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
+            assert input_dist_attr is not None, "dist_attr is {}".format(
+                op_dist_attr)
+            matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
+                                                      input_dist_attr)
+        output_varname = mul_op.desc.output_arg_names()[0]
+        output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
+        assert output_dist_attr is not None, "dist_attr is {}".format(
+            op_dist_attr)
+        matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
+                                                   output_dist_attr)
+        ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
+
+        # allreduce
+        allreduce_op_dist_attr = OperatorDistributedAttribute()
+        allreduce_op_dist_attr.process_mesh = op_dist_attr.process_mesh
+        allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type
+        allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx
+        for input_varname in c_allreduce_sum_op.desc.input_arg_names():
+            input_var = main_block.var(input_varname)
+            tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var)
+            assert tensor_dist_attr is not None
+            allreduce_op_dist_attr.set_input_dist_attr(input_varname,
+                                                       tensor_dist_attr)
+        for output_varname in c_allreduce_sum_op.desc.output_arg_names():
+            output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
+            assert output_dist_attr is not None, "dist_attr is {}".format(
+                op_dist_attr)
+            allreduce_op_dist_attr.set_output_dist_attr(output_varname,
+                                                        output_dist_attr)
+        ctx.set_op_dist_attr_for_program(c_allreduce_sum_op,
+                                         allreduce_op_dist_attr)
+
+        # init param sync
+        if Weight_var.is_parameter and not op_dist_attr.is_recompute:
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
+                             rank_id)
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        _right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
+
+
+# ReplicateParallel
+class DistributedMulImpl2(DistributedOperatorImpl):
+    def __init__(self, name):
+        super(DistributedMulImpl2, self).__init__(name)
+
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        x_name = op_desc.input('X')[0]
+        y_name = op_desc.input('Y')[0]
+        x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+        y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+
+        if is_dim_shard(x_dims_mapping[-1]):
+            return False
+        if is_valid_list_index(x_dims_mapping,
+                               -2) and is_dim_shard(x_dims_mapping[-2]):
+            return False
+
+        if is_dim_shard(y_dims_mapping[-1]):
+            return False
+        if is_valid_list_index(y_dims_mapping,
+                               -2) and is_dim_shard(y_dims_mapping[-2]):
+            return False
+        return True
+
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        out_name = op_desc.output('Out')[0]
+        out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+
+        if is_dim_shard(out_dims_mapping[-1]):
+            return False
+        if is_valid_list_index(out_dims_mapping,
+                               -2) and is_dim_shard(out_dims_mapping[-2]):
+            return False
+
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        if (not self.is_input_compatible(dist_op)) or \
+            (not self.is_output_compatible(dist_op)):
+            return False
+
+        if not _is_auto_compatible_for_matmul(dist_op):
+            return False
+
+        return True
+
+    def update_dims_mapping(self, dist_op):
+        changed = False
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
+        if dim_changed:
+            changed = True
+        return changed
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        _right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
+
+
+register_distributed_operator_impl("mul",
+                                   DistributedMulImpl0("column_parallel"))
+register_distributed_operator_impl("mul", DistributedMulImpl1("row_parallel"))
+register_distributed_operator_impl("mul",
+                                   DistributedMulImpl2("replicate_parallel"))
diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/tuner/__init__.py
new file mode 100644
index 0000000000000..513558501a0eb
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py
new file mode 100644
index 0000000000000..140336566a146
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py
@@ -0,0 +1,214 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+class MetricRecord(object):
+    """
+    One record for a single metric at a given execution step.
+    """
+
+    def __init__(self, value, step):
+        self._value = value
+        self._step = step
+
+    @property
+    def value(self):
+        return self._value
+
+    @value.setter
+    def value(self, value):
+        self._value = value
+
+    @property
+    def step(self):
+        return self._step
+
+    @step.setter
+    def step(self, step):
+        self._step = step
+
+    def mean(self):
+        return np.mean(self.value)
+
+    def get_state(self):
+        return {"value": self.value, "step": self.step}
+
+    @classmethod
+    def from_state(cls, state):
+        return cls(**state)
+
+    def __eq__(self, other):
+        if not isinstance(other, MetricRecord):
+            return False
+        return other.value == self.value and other.step == self.step
+
+    def __repr__(self):
+        return "MetricRecord(value={}, step={})".format(self.value, self.step)
+
+
+class MetricRecords(object):
+    """
+    Records of a single metric across different executions.
+    """
+
+    def __init__(self, direction="min"):
+        if direction not in {"min", "max"}:
+            raise ValueError(
+                "direction should be one of {min, max}, but got: {}.".format(
+                    direction))
+        self._direction = direction
+        self._records = {}
+
+    @property
+    def records(self):
+        return sorted(self._records.values(), key=lambda r: r.step)
+
+    @records.setter
+    def records(self, records):
+        for r in records:
+            self.update(r.value, step=r.step)
+
+    @property
+    def direction(self):
+        return self._direction
+
+    @direction.setter
+    def direction(self, direction):
+        self._direction = direction
+
+    def update(self, value, step=0):
+        if step in self._records:
+            self._records[step].set_value(value)
+        else:
+            self._records[step] = MetricRecord(value, step=step)
+
+    def get_best_value(self):
+        values = list(r.mean() for r in self._records.values())
+        if not values:
+            return None
+        if self._direction == "min":
+            return np.nanmin(values)
+        return np.nanmax(values)
+
+    def get_best_step(self):
+        best_value = self.get_best_value()
+        if best_value is None:
+            return None
+        for r in self._records.values():
+            if r.mean() == best_value:
+                return r.step
+
+    def get_statistics(self):
+        records = self.records
+        records_values = [r.mean() for r in records]
+        if not len(records_values):
+            return {}
+        return {
+            "min": float(np.nanmin(records_values)),
+            "max": float(np.nanmax(records_values)),
+            "mean": float(np.nanmean(records_values)),
+            "median": float(np.nanmedian(records_values)),
+            "var": float(np.nanvar(records_values)),
+            "std": float(np.nanstd(records_values)),
+        }
+
+    def get_state(self):
+        state = {}
+        state["direction"] = self._direction
+        state["records"] = [r.get_state() for r in self.records]
+        return state
+
+    @classmethod
+    def from_state(cls, state):
+        records = cls(state["direction"])
+        records.records = [MetricRecord.from_state(r) for r in state["records"]]
+        print("here 1", records.records)
+        return records
+
+
+class MetricsRecorder(object):
+    """
+    Record the values for all metrics.
+    """
+
+    def __init__(self, metrics=None):
+        self._records = {}
+        self.register_metrics(metrics)
+
+    @property
+    def records(self):
+        return self._records
+
+    def exists(self, name):
+        return name in self._records
+
+    def register_metrics(self, metrics=None):
+        metrics = metrics or []
+        for metric in metrics:
+            self.register(metric.name)
+
+    def register(self, name, direction=None):
+        if self.exists(name):
+            raise ValueError("Metric {} have been registered.".format(name))
+        if direction is None:
+            direction = "min"
+        self._records[name] = MetricRecords(direction)
+
+    def update(self, name, value, step=0):
+        value = float(value)
+        if not self.exists(name):
+            self.register(name)
+
+        prev_best = self._records[name].get_best_value()
+        self._records[name].update(value, step=step)
+        new_best = self._records[name].get_best_value()
+
+        improved = new_best != prev_best
+        return improved
+
+    def get_records(self, name):
+        return self._records[name].records
+
+    def set_records(self, name, records):
+        if not self.exists(name):
+            self.register(name)
+        self._records[name].records = records
+
+    def get_best_value(self, name):
+        return self._records[name].get_best_value()
+
+    def get_best_step(self, name):
+        return self._records[name].get_best_step()
+
+    def get_statistics(self, name):
+        return self._records[name].get_statistics()
+
+    def get_state(self):
+        return {
+            "metrics": {
+                name: metric_records.get_state()
+                for name, metric_records in self._records.items()
+            }
+        }
+
+    @classmethod
+    def from_state(cls, state):
+        recorder = cls()
+        recorder._records = {
+            name: MetricRecords.from_state(metric_records)
+            for name, metric_records in state["metrics"].items()
+        }
+        return recorder
diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py
new file mode 100644
index 0000000000000..d61e53a027240
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/storable.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+
+class Storable(object):
+    def get_state(self):
+        raise NotImplementedError
+
+    def set_state(self, state):
+        raise NotImplementedError
+
+    def save(self, path):
+        state = self.get_state()
+        state_json = json.dumps(state)
+        with open(path, "w") as f:
+            f.write(state_json)
+        return str(path)
+
+    def load(self, path):
+        with open(path, "r") as f:
+            state_data = f.read()
+        state = json.loads(state_data)
+        self.set_state(state)
diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py
new file mode 100644
index 0000000000000..22a6638c5ca63
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/trial.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+import random
+import time
+from enum import Enum
+
+from .storable import Storable
+from .recorder import MetricsRecorder
+from .tunable_space import TunableSpace
+
+
+class TrialStatus:
+    RUNNING = "RUNNING"
+    COMPLETED = "COMPLETED"
+    STOPPED = "STOPPED"
+    INVALID = "INVALID"
+
+
+class Trial(Storable):
+    def __init__(self, tunable_space, trial_id=None,
+                 status=TrialStatus.RUNNING):
+        self._id = _generate_trial_id() if trial_id is None else trial_id
+        self._space = tunable_space
+        self._recorder = MetricsRecorder()
+        self._score = None
+        self._best_step = None
+        self._status = status
+
+    @property
+    def id(self):
+        return self._id
+
+    @property
+    def space(self):
+        return self._space
+
+    @property
+    def recorder(self):
+        return self._recorder
+
+    @property
+    def score(self):
+        return self._score
+
+    @score.setter
+    def score(self, score):
+        self._score = score
+
+    @property
+    def best_step(self):
+        return self._best_step
+
+    @best_step.setter
+    def best_step(self, best_step):
+        self._best_step = best_step
+
+    @property
+    def status(self):
+        return self._status
+
+    @status.setter
+    def status(self, status):
+        self._status = status
+
+    def summary(self):
+        print("Tunable space:")
+        if self.space.values:
+            for tv, value in self.space.values.items():
+                print(tv + ":", value)
+
+        if self.score is not None:
+            print("Score: {}".format(self.score))
+
+    def get_state(self):
+        return {
+            "id": self.id,
+            "space": self.space.get_state(),
+            "recorder": self.recorder.get_state(),
+            "score": self.score,
+            "best_step": self.best_step,
+            "status": self.status,
+        }
+
+    def set_state(self, state):
+        self._id = state["id"]
+        self._space = TunableSpace.from_state(state["space"])
+        self._recorder = MetricsRecorder.from_state(state["recorder"])
+        self._score = state["score"]
+        self._best_step = state["best_step"]
+        self._status = state["status"]
+
+    @classmethod
+    def from_state(cls, state):
+        trial = cls(tunable_space=None)
+        trial.set_state(state)
+        return trial
+
+
+def _generate_trial_id():
+    s = str(time.time()) + str(random.randint(1, int(1e7)))
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32]
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
new file mode 100644
index 0000000000000..f63364c5b75ef
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py
@@ -0,0 +1,151 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import contextlib
+import copy
+import math
+import random
+import numpy as np
+
+from .tunable_variable import Boolean
+from .tunable_variable import Fixed
+from .tunable_variable import Choice
+from .tunable_variable import IntRange
+from .tunable_variable import FloatRange
+
+
+class TunableSpace(object):
+    """
+    A TunableSpace is constructed by the tunable variables.
+    """
+
+    def __init__(self):
+        # Tunable variables for this tunable variables
+        self._variables = {}
+        # Specific values coresponding to each tunable variable
+        self._values = {}
+
+    @property
+    def variables(self):
+        return self._variables
+
+    @property
+    def values(self):
+        return self._values
+
+    def get_value(self, name):
+        if name in self.values:
+            return self.values[name]
+        else:
+            raise KeyError("{} does not exist.".format(name))
+
+    def set_value(self, name, value):
+        if name in self.values:
+            self.values[name] = value
+        else:
+            raise KeyError("{} does not exist.".format(name))
+
+    def _exists(self, name):
+        if name in self._variables:
+            return True
+        return False
+
+    def _retrieve(self, tv):
+        tv = tv.__class__.from_state(tv.get_state())
+        if self._exists(tv.name):
+            return self.get_value(tv.name)
+        return self._register(tv)
+
+    def _register(self, tv):
+        self._variables[tv.name] = tv
+        if tv.name not in self.values:
+            self.values[tv.name] = tv.default
+        return self.values[tv.name]
+
+    def __getitem__(self, name):
+        return self.get_value(name)
+
+    def __setitem__(self, name, value):
+        self.set_value(name, value)
+
+    def __contains__(self, name):
+        try:
+            self.get_value(name)
+            return True
+        except (KeyError, ValueError):
+            return False
+
+    def fixed(self, name, default):
+        tv = Fixed(name=name, default=default)
+        return self._retrieve(tv)
+
+    def boolean(self, name, default=False):
+        tv = Boolean(name=name, default=default)
+        return self._retrieve(tv)
+
+    def choice(self, name, values, default=None):
+        tv = Choice(name=name, values=values, default=default)
+        return self._retrieve(tv)
+
+    def int_range(self, name, start, stop, step=1, default=None):
+        tv = IntRange(
+            name=name, start=start, stop=stop, step=step, default=default)
+        return self._retrieve(tv)
+
+    def float_range(self, name, start, stop, step=None, default=None):
+        tv = FloatRange(
+            name=name, start=start, stop=stop, step=step, default=default)
+        return self._retrieve(tv)
+
+    def get_state(self):
+        return {
+            "variables": [{
+                "class_name": v.__class__.__name__,
+                "state": v.get_state()
+            } for v in self._variables.values()],
+            "values": dict((k, v) for (k, v) in self.values.items())
+        }
+
+    @classmethod
+    def from_state(cls, state):
+        ts = cls()
+        for v in state["variables"]:
+            v = _deserialize_tunable_variable(v)
+            ts._variables[v.name] = v
+        ts._values = dict((k, v) for (k, v) in state["values"].items())
+        return ts
+
+
+def _deserialize_tunable_variable(state):
+    classes = (Boolean, Fixed, Choice, IntRange, FloatRange)
+    cls_name_to_cls = {cls.__name__: cls for cls in classes}
+
+    if isinstance(state, classes):
+        return state
+
+    if (not isinstance(state, dict) or "class_name" not in state or
+            "state" not in state):
+        raise ValueError(
+            "Expect state to be a python dict containing class_name and state as keys, but found {}"
+            .format(state))
+
+    cls_name = state["class_name"]
+    cls = cls_name_to_cls[cls_name]
+    if cls is None:
+        raise ValueError("Unknown class name {}".format(cls_name))
+
+    cls_state = state["state"]
+    deserialized_object = cls.from_state(cls_state)
+    return deserialized_object
diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
new file mode 100644
index 0000000000000..9549b44c48ecb
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py
@@ -0,0 +1,242 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+class TunableVariable(object):
+    """
+    Tunablevariable base class.
+    """
+
+    def __init__(self, name, default=None):
+        self.name = name
+        self._default = default
+
+    @property
+    def default(self):
+        return self._default
+
+    def get_state(self):
+        return {"name": self.name, "default": self.default}
+
+    @classmethod
+    def from_state(cls, state):
+        return cls(**state)
+
+
+class Fixed(TunableVariable):
+    """
+    Fixed variable which cannot be changed.
+    """
+
+    def __init__(self, name, default):
+        super(Fixed, self).__init__(name=name, default=default)
+        self.name = name
+        if not isinstance(default, (str, int, float, bool)):
+            raise ValueError(
+                "Fixed must be an str, int, float or bool, but found {}"
+                .format(default))
+        self._default = default
+
+    def random(self, seed=None):
+        return self._default
+
+    def __repr__(self):
+        return "Fixed(name: {}, value: {})".format(self.name, self.default)
+
+
+class Boolean(TunableVariable):
+    """
+    Choice between True and False.
+    """
+
+    def __init__(self, name, default=False):
+        super(Boolean, self).__init__(name=name, default=default)
+        if default not in {True, False}:
+            raise ValueError(
+                "default must be a Python boolean, but got {}".format(default))
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        return rng.choice((True, False))
+
+    def __repr__(self):
+        return 'Boolean(name: "{}", default: {})'.format(self.name,
+                                                         self.default)
+
+
+class Choice(TunableVariable):
+    def __init__(self, name, values, default=None):
+        super(Choice, self).__init__(name=name, default=default)
+
+        types = set(type(v) for v in values)
+        if len(types) > 1:
+            raise TypeError(
+                "Choice can contain only one type of value, but found values: {} with types: {}."
+                .format(str(values), str(types)))
+
+        if isinstance(values[0], str):
+            values = [str(v) for v in values]
+            if default is not None:
+                default = str(default)
+        elif isinstance(values[0], int):
+            values = [int(v) for v in values]
+            if default is not None:
+                default = int(default)
+        elif isinstance(values[0], float):
+            values = [float(v) for v in values]
+            if default is not None:
+                default = float(default)
+        elif isinstance(values[0], bool):
+            values = [bool(v) for v in values]
+            if default is not None:
+                default = bool(default)
+        else:
+            raise TypeError(
+                "Choice can only contain str, int, float, or boll, but found: {} "
+                .format(str(values)))
+        self.values = values
+
+        if default is not None and default not in values:
+            raise ValueError(
+                "The default value should be one of the choices {}, but found {}".
+                format(values, default))
+        self._default = default
+
+    @property
+    def default(self):
+        if self._default is None:
+            if None in self.values:
+                return None
+            return self.values[0]
+        return self._default
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        return rng.choice(self.values)
+
+    def get_state(self):
+        state = super(Choice, self).get_state()
+        state["values"] = self.values
+        return state
+
+    def __repr__(self):
+        return 'Choice(name: "{}", values: {}, default: {})'.format(
+            self.name, self.values, self.default)
+
+
+class IntRange(TunableVariable):
+    """
+    Integer range.
+    """
+
+    def __init__(self, name, start, stop, step=1, default=None, endpoint=False):
+        super(IntRange, self).__init__(name=name, default=default)
+        self.start = self._check_int(start)
+        self.stop = self._check_int(stop)
+        self.step = self._check_int(step)
+        self._default = default
+        self.endpoint = endpoint
+
+    @property
+    def default(self):
+        if self._default is not None:
+            return self._default
+        return self.start
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        value = (self.stop - self.start) * rng.random() + self.start
+        if self.step is not None:
+            if self.endpoint:
+                values = np.arange(self.start, self.stop + 1e-7, step=self.step)
+            else:
+                values = np.arange(self.start, self.stop, step=self.step)
+            closest_index = np.abs(values - value).argmin()
+            value = values[closest_index]
+        return int(value)
+
+    def get_state(self):
+        state = super(IntRange, self).get_state()
+        state["start"] = self.start
+        state["stop"] = self.stop
+        state["step"] = self.step
+        state["default"] = self._default
+        return state
+
+    def _check_int(self, val):
+        int_val = int(val)
+        if int_val != val:
+            raise ValueError("Expects val is an int, but found: {}.".format(
+                str(val)))
+        return int_val
+
+    def __repr__(self):
+        return "IntRange(name: {}, start: {}, stop: {}, step: {}, default: {})".format(
+            self.name, self.start, self.stop, self.step, self.default)
+
+
+class FloatRange(TunableVariable):
+    """
+    Float range.
+    """
+
+    def __init__(self,
+                 name,
+                 start,
+                 stop,
+                 step=None,
+                 default=None,
+                 endpoint=False):
+        super(FloatRange, self).__init__(name=name, default=default)
+        self.stop = float(stop)
+        self.start = float(start)
+        if step is not None:
+            self.step = float(step)
+        else:
+            self.step = None
+        self._default = default
+        self.endpoint = endpoint
+
+    @property
+    def default(self):
+        if self._default is not None:
+            return self._default
+        return self.start
+
+    def random(self, seed=None):
+        rng = np.random.default_rng(seed)
+        value = (self.stop - self.start) * rng.random() + self.start
+        if self.step is not None:
+            if self.endpoint:
+                values = np.arange(self.start, self.stop + 1e-7, step=self.step)
+            else:
+                values = np.arange(self.start, self.stop, step=self.step)
+            closest_index = np.abs(values - value).argmin()
+            value = values[closest_index]
+        return value
+
+    def get_state(self):
+        state = super(FloatRange, self).get_state()
+        state["start"] = self.start
+        state["stop"] = self.stop
+        state["step"] = self.step
+        state["endpoint"] = self.endpoint
+        return state
+
+    def __repr__(self):
+        return "FloatRange(name: {}, start: {}, stop: {}, step: {}, default: {}, endpoint: {})".format(
+            self.name, self.start, self.stop, self.step, self.default,
+            self.endpoint)
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py
index 241eadcbace22..86c274cb45cc3 100644
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/utils.py
@@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context,
         used_dist_context._dist_op_context = DistributedOperatorContext()
         _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program(
             rank_id, used_dist_context)
-        # print("dist_main_program: ", dist_main_program)
         all_dist_main_program.append(dist_main_program)
 
     return all_dist_main_program
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index bf6556d21e9fc..fde3805914d80 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import os
+from datetime import timedelta
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.framework import Variable
 from ..fluid.framework import OpProtoHolder
@@ -73,6 +74,7 @@ class ReduceOp:
     MAX = 1
     MIN = 2
     PROD = 3
+    AVG = 4
 
 
 class Group():
@@ -80,11 +82,13 @@ class Group():
     The abstract representation of group.
     """
 
-    def __init__(self, rank, rank_num, id=0, ranks=[]):
+    def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None):
         self.rank = rank
         self.nranks = rank_num
         self.id = id
         self.ranks = ranks
+        self.pg = pg
+        self.name = name
 
     def is_member(self):
         if self.rank < 0:
@@ -99,11 +103,16 @@ def get_group_rank(self, rank):
         else:
             return -1
 
+    @property
+    def process_group(self):
+        return self.pg
+
     def __repr__(self):
         debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format(
             self.rank, self.nranks, self.id)
         debug_str += ", ".join(map(str, self.ranks))
-        debug_str += ". "
+        debug_str += "; name: "
+        debug_str += self.name if self.name else "None"
         return debug_str
 
 
@@ -121,6 +130,17 @@ def _get_global_env():
 # Dict[int, Group]
 _group_map = {}
 
+# group map by name : the map of all groups from their names
+# Dict[name, Group]
+_group_map_by_name = {}
+
+# Name of the default group for init_parallel_env
+_default_group_name = "_default_pg"
+
+_valid_backend_list = ['nccl', 'gloo', 'hccl']
+_default_store = None  # the default tcp store
+_default_backend = None
+
 
 def _get_group_map():
     global _group_map
@@ -135,10 +155,29 @@ def _get_global_group():
     return _get_group_map()[0]
 
 
+def _get_group_map_by_name():
+    global _group_map_by_name
+    assert _default_group_name in _group_map_by_name, (
+        "Call paddle.distributed.init_parallel_env first "
+        "to initialize the distributed environment.")
+    return _group_map_by_name
+
+
+def _get_default_group():
+    assert _default_group_name in _group_map_by_name, (
+        "Call paddle.distributed.init_parallel_env first "
+        "to initialize the distributed environment.")
+    return _get_group_map_by_name()[_default_group_name]
+
+
 def _new_ring_id():
     return len(_get_group_map()) + max(_get_global_env().nrings, 9)
 
 
+def _new_group_name_id():
+    return len(_get_group_map_by_name()) + max(_get_global_env().nrings, 9)
+
+
 def get_group(id=0):
     """
 
@@ -163,6 +202,194 @@ def get_group(id=0):
     return gm[id] if id in gm else None
 
 
+def _new_process_group_impl(backend, store, rank, world_size, group_name,
+                            pg_options):
+    if backend == "gloo":
+        gloo_store = core.GlooStore(store)
+
+    pg = None
+    if backend == "gloo":
+        pg = core.ProcessGroupGloo(gloo_store, rank, world_size)
+    elif backend == "nccl":
+        pg = core.ProcessGroupNCCL(store, rank, world_size)
+    elif backend == "hccl":
+        pg = core.ProcessGroupHCCL(store, rank, world_size)
+
+    return pg
+
+
+def _init_parallel_env(rank=None,
+                       world_size=None,
+                       backend="nccl",
+                       timeout=timedelta(0),
+                       pg_options=None):
+    """
+
+    Initializes the default distributed environment.
+    
+    Args:
+        rank (int, optional): the rank of the current process or device from 0 to world_size (exclusive).
+            If you launch your training with paddle.distributed.run or 
+            paddle.distributed.launch module, None can be given. Default: None.
+        world_size (int, optional): total number of processes or devices.
+            If you launch your training with paddle.distributed.run or 
+            paddle.distributed.launch module, None can be given. Default: None.
+        backend (str, optional): the name of the backend used to initialize
+            the distributed environment. The value can be one of 'nccl' for
+            GPU, 'gloo' for CPU or 'hccl' for NPU. Default: 'nccl'.
+        timeout (datetime.timedelta, optional): timeout used for operations of
+            the group. Default: datetime.timedelta(0) which means no timeout.
+        pg_options (dict, optional): options for the group. Default: None.
+
+    Returns:
+        Group: a group.
+
+    Examples:
+
+        .. code-block:: python
+
+            # filename: train.py
+            import paddle
+            paddle.distributed.init_parallel_env(0, 1)
+            
+            # how to start
+            # python paddle.distributed.run --gpus="0,1" train.py
+
+    """
+
+    global _group_map_by_name
+    global _default_group_name
+    assert _default_group_name not in _group_map_by_name, (
+        "The default distributed environment has been initialized.")
+
+    assert backend in _valid_backend_list, (
+        "Backend must be one of {}, but the given one is: {}".format(
+            _valid_backend_list, backend))
+    _default_backend = backend
+
+    assert isinstance(timeout, timedelta), (
+        "timeout must be of the type datetime.timedelta.")
+
+    if rank is None or world_size is None:
+        assert rank is None and world_size is None, (
+            "rank and world_size should be unset at the same time.")
+        trainer_id = os.getenv("PADDLE_TRAINER_ID", None)
+        trainer_num = os.getenv("PADDLE_TRAINERS_NUM", None)
+        if trainer_id is None or trainer_num is None:
+            warnings.warn("If rank and world_size are both None, please start "
+                          "your training with paddle.distributed.run or "
+                          "paddle.distributed.launch module. Otherwise, "
+                          "init_parallel_env will do nothing.")
+            return None
+        rank = int(trainer_id)
+        world_size = int(trainer_num)
+
+    assert rank >= 0 and world_size > rank and world_size > 1, (
+        "rank must be non-negative and world_size must be the "
+        "maximum rank plus one. Moreover, at least two processes are "
+        "required to create a process group.")
+
+    master_addr = os.getenv("MASTER_ADDR", None)
+    master_port = os.getenv("MASTER_PORT", None)
+    if not master_addr or not master_port:
+        endpoints = os.getenv("PADDLE_MASTER", None)
+        if endpoints is None:
+            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None)
+        if not endpoints:
+            raise ValueError(
+                "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
+                "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
+                "and 'export MASTER_ADDR=54612'. Or you can start your training"
+                "with paddle.distributed.run or "
+                "paddle.distributed.luanch module.")
+        if ',' in endpoints:
+            endpoints = endpoints.split(',')[0]
+        master_addr, master_port = endpoints.split(":")
+
+    master_port = int(master_port)
+
+    is_master = rank == 0
+    global _default_store
+    _default_store = core.TCPStore(master_addr, master_port, is_master,
+                                   world_size, timeout)
+
+    pg = _new_process_group_impl(backend, _default_store, rank, world_size,
+                                 _default_group_name, pg_options)
+    ranks = list(range(world_size))
+    group = Group(
+        rank, world_size, id=0, ranks=ranks, pg=pg, name=_default_group_name)
+
+    paddle.fluid.dygraph.parallel_helper._set_parallel_ctx(True)
+    _group_map_by_name[_default_group_name] = group
+    return group
+
+
+def _new_group(ranks=None,
+               backend=None,
+               group_name=None,
+               timeout=timedelta(0),
+               pg_options=None):
+    """
+    Create a new process group.
+
+    Args:
+        ranks (list, optional): list of ranks for the new group. If None is given, 
+            all processes is used. Default: None.
+        backend (str, optional): the name of the backend used to initialize
+            the distributed environment. Default: the one for init_parallel_env.
+        timeout (datetime.timedelta, optional): timeout used for operations of
+            the group. Default: datetime.timedelta(0).
+        pg_options (dict, optional): options for the group. Default: None.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            paddle.distributed.init_parallel_env(0, 1)
+            paddle.distributed.new_group([0, 1])
+
+            # how to start
+            # python paddle.distributed.run --gpus="0,1" train.py
+
+    """
+    global _default_group_name
+    if group_name is None:
+        group_name = _default_group_name + str(_new_group_name_id())
+    if group_name == _default_group_name:
+        raise ValueError("group_name must be specified and it cannot be '{}' "
+                         "which is used for the default process group created "
+                         "by init_parallel_env.".format(_default_group_name))
+    global_group = _get_default_group()
+    global_rank = global_group.rank
+    global_ranks = global_group.ranks
+    if ranks is None:
+        ranks = global_ranks
+    assert len(ranks) <= len(global_ranks), (
+        "Size of new group must be less than or "
+        "equal to that of the default global group.")
+    size = len(ranks)
+    assert size > 1, "A group must have at least two memebers."
+    ranks = sorted(ranks)
+    if global_rank in ranks:
+        rank = ranks.index(global_rank)
+        pg = _new_process_group_impl(backend, _default_store, rank, size,
+                                     group_name, pg_options)
+    else:
+        rank = -1
+        pg = None
+    group = Group(
+        rank,
+        size,
+        id=_new_group_name_id(),
+        ranks=ranks,
+        pg=pg,
+        name=group_name)
+    _group_map_by_name[group_name] = group
+
+    return group
+
+
 def barrier(group=None):
     """
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
index 89b59254e5b91..6a30276e02ba2 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py
@@ -89,7 +89,7 @@ def _dygraph_clip(self, params_grads):
             global_norm_fp16 = paddle.cast(
                 global_norm_fp16, dtype=paddle.float32)
 
-        # global norm of non-distributed FP16 params_and_grads for slice parameter
+        # global norm of non-distributed FP16 params_and_grads for unslice parameter
         if len(unslice_params_fp16) == 0:
             global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
         else:
@@ -104,21 +104,20 @@ def _dygraph_clip(self, params_grads):
                 [0.], dtype=paddle.float32)
         global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
 
-        # global norm of non-distributed FP32 params_and_grads for slice parameter
+        # global norm of non-distributed FP32 params_and_grads for unslice parameter
         global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
             unslice_params_fp32) != 0 else paddle.to_tensor(
                 [0.], dtype=paddle.float32)
         global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
         global_unslice_var = global_unslice_fp16 + global_unslice_fp32
 
-        global_norm_var = global_norm_fp16 + global_norm_fp32
+        global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var
 
         # add all reduce to get global norm of distributed params_and_grads
         dev_id = int(self._device.split(":")[1])
         with device_guard(dev_id, "gpu"):
             paddle.distributed.all_reduce(global_norm_var, group=self._group)
 
-        global_norm_var += global_unslice_var
         global_norm_var = layers.sqrt(global_norm_var)
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch/__init__.py
similarity index 80%
rename from python/paddle/distributed/launch.py
rename to python/paddle/distributed/launch/__init__.py
index e02a439025b77..4ce89fa36b06b 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch/__init__.py
@@ -1,18 +1,15 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.distributed.fleet import launch
-launch.launch()
-
 __all__ = []
diff --git a/python/paddle/distributed/launch/__main__.py b/python/paddle/distributed/launch/__main__.py
new file mode 100644
index 0000000000000..42f844ca71774
--- /dev/null
+++ b/python/paddle/distributed/launch/__main__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .main import launch
+
+launch()
diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py
new file mode 100644
index 0000000000000..510f49d8246f1
--- /dev/null
+++ b/python/paddle/distributed/launch/context/__init__.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.distributed.launch import plugins
+
+from .node import Node
+from .status import Status
+from .args_envs import parse_args, fetch_envs, env_args_mapping
+
+import logging
+
+
+class Context(object):
+    def __init__(self, enable_plugin=True):
+        self.args, self.unknown_args = parse_args()
+        self.envs = fetch_envs()
+        self.logger = self.get_logger()
+
+        self.node = Node()
+        self.status = Status()
+
+        self.set_env_in_args()
+
+        # design for event queue, later
+        self.events = []
+
+        if enable_plugin:
+            self._enable_plugin()
+
+    def is_legacy_mode(self):
+        if self.args.legacy:
+            return True
+
+        if len(self.unknown_args) > 0:
+            self.logger.warning("Compatible mode enable with args {}".format(
+                self.unknown_args))
+            return True
+
+        legacy_env_list = [
+            'DISTRIBUTED_TRAINER_ENDPOINTS',
+            'PADDLE_ELASTIC_JOB_ID',
+            'PADDLE_DISTRI_BACKEND',
+            'FLAGS_START_PORT',
+        ]
+
+        for env in legacy_env_list:
+            if env in self.envs:
+                self.logger.warning(
+                    "ENV {} is deprecated, legacy launch enable".format(env))
+                return True
+
+        if self.args.master:
+            return False
+
+        return False
+
+    def get_envs(self):
+        return self.envs.copy()
+
+    def _enable_plugin(self):
+        for pl in plugins.enabled_plugins:
+            pl(self)
+
+    def get_logger(self, level=logging.INFO):
+        logger = logging.getLogger("LAUNCH")
+        logger.setLevel(self.args.log_level.upper() or level)
+        formatter = logging.Formatter(
+            fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
+        ch = logging.StreamHandler()
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+        return logger
+
+    def continous_log(self) -> bool:
+        if self.args.log_level.upper() in ['DEBUG', 'ERROR']:
+            return True
+        else:
+            return False
+
+    def set_env_in_args(self):
+        for k, v in env_args_mapping.items():
+            if k in self.envs:
+                setattr(self.args, v, self.envs[k])
diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py
new file mode 100644
index 0000000000000..b624281e44db3
--- /dev/null
+++ b/python/paddle/distributed/launch/context/args_envs.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from argparse import ArgumentParser, REMAINDER
+
+env_args_mapping = {
+    'POD_IP': 'host',
+    'PADDLE_MASTER': 'master',
+    'PADDLE_DEVICES': 'devices',
+    'PADDLE_NNODES': 'nnodes',
+    'PADDLE_RUN_MODE': 'run_mode',
+    'PADDLE_LOG_LEVEL': 'log_level',
+    'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
+    'PADDLE_JOB_ID': 'job_id',
+    'PADDLE_RANK': 'rank',
+    'PADDLE_LOG_DIR': 'log_dir',
+    'PADDLE_MAX_RESTART': 'max_restart',
+    'PADDLE_ELASTIC_LEVEL': 'elastic_level',
+    'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
+    'PADDLE_SERVER_NUM': 'server_num',
+    'PADDLE_TRAINER_NUM': 'trainer_num',
+    'PADDLE_SERVERS_ENDPOINTS': 'servers',
+    'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
+    'PADDLE_GLOO_PORT': 'gloo_port',
+    'PADDLE_WITH_GLOO': 'with_gloo',
+}
+
+
+def fetch_envs():
+    os.environ.pop('http_proxy', None)
+    os.environ.pop('https_proxy', None)
+
+    return os.environ.copy()
+
+
+def parse_args():
+    parser = ArgumentParser()
+
+    base_group = parser.add_argument_group("Base Parameters")
+
+    base_group.add_argument(
+        "--master",
+        type=str,
+        default=None,
+        help="the master/rendezvous server, ip:port")
+
+    base_group.add_argument(
+        "--legacy", type=bool, default=False, help="use legacy launch")
+
+    base_group.add_argument(
+        "--rank", type=int, default=-1, help="the node rank")
+
+    base_group.add_argument(
+        "--log_level", type=str, default="INFO", help="log level. Default INFO")
+
+    base_group.add_argument(
+        "--nnodes",
+        type=str,
+        default="1",
+        help="the number of nodes, i.e. pod/node number")
+
+    base_group.add_argument(
+        "--nproc_per_node",
+        type=int,
+        default=None,
+        help="the number of processes in a pod")
+
+    base_group.add_argument(
+        "--log_dir",
+        type=str,
+        default="log",
+        help="the path for each process's log. Default ./log")
+    base_group.add_argument(
+        "--run_mode",
+        type=str,
+        default="collective",
+        help="run mode of the job, collective/ps/ps-heter")
+
+    base_group.add_argument(
+        "--job_id",
+        type=str,
+        default="default",
+        help="unique id of the job. Default default")
+
+    base_group.add_argument(
+        "--devices",
+        type=str,
+        default=None,
+        help="accelerate devices. as --gpus,npus,xps")
+
+    base_group.add_argument("--host", type=str, default=None, help="host ip")
+
+    base_group.add_argument(
+        "training_script",
+        type=str,
+        help="the full path of py script,"
+        "followed by arguments for the "
+        "training script")
+
+    base_group.add_argument('training_script_args', nargs=REMAINDER)
+
+    ps_group = parser.add_argument_group("Parameter-Server Parameters")
+    # for parameter server
+    ps_group.add_argument(
+        "--servers", type=str, default='', help="servers endpoints full list")
+    ps_group.add_argument(
+        "--trainers", type=str, default='', help="trainers endpoints full list")
+
+    ps_group.add_argument(
+        "--trainer_num", type=int, default=None, help="number of trainers")
+    ps_group.add_argument(
+        "--server_num", type=int, default=None, help="number of servers")
+    ps_group.add_argument(
+        "--gloo_port", type=int, default=6767, help="gloo http port")
+    ps_group.add_argument(
+        "--with_gloo", type=str, default="0", help="use gloo or not")
+
+    # parameter elastic mode
+    elastic_group = parser.add_argument_group("Elastic Parameters")
+    elastic_group.add_argument(
+        "--max_restart",
+        type=int,
+        default=3,
+        help="the times can restart. Default 3")
+
+    elastic_group.add_argument(
+        "--elastic_level",
+        type=int,
+        default=-1,
+        help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
+    )
+
+    elastic_group.add_argument(
+        "--elastic_timeout",
+        type=int,
+        default=30,
+        help="seconds to wait before elastic job begin to train")
+
+    return parser.parse_known_args()
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py
new file mode 100644
index 0000000000000..9163e7abd9183
--- /dev/null
+++ b/python/paddle/distributed/launch/context/device.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class DeviceType:
+    CPU = 'cpu'
+    GPU = 'gpu'
+    XPU = 'xpu'
+    NPU = 'npu'
+    MLU = 'mlu'
+
+
+class Device(object):
+    def __init__(self, dtype=None, memory="", labels=""):
+        self._dtype = dtype
+        self._memory = memory
+        self._labels = labels
+
+    def __str__(self):
+        return ",".join(self._labels)
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def count(self):
+        return len(self._labels) or 1
+
+    @property
+    def memory(self):
+        return self._memory
+
+    @property
+    def labels(self):
+        return self._labels
+
+    @labels.setter
+    def labels(self, lbs):
+        if isinstance(lbs, str):
+            self._labels = lbs.split(',')
+        elif isinstance(lbs, list):
+            self._labels = lbs
+        else:
+            self._labels = []
+
+    def get_selected_flag_key(self):
+        if self._dtype == DeviceType.CPU:
+            return 'FLAGS_selected_cpus'
+        if self._dtype == DeviceType.GPU:
+            return 'FLAGS_selected_gpus'
+        if self._dtype == DeviceType.NPU:
+            return 'FLAGS_selected_npus'
+        if self._dtype == DeviceType.XPU:
+            return 'FLAGS_selected_xpus'
+        if self._dtype == DeviceType.MLU:
+            return 'FLAGS_selected_mlus'
+        return 'FLAGS_selected_devices'
+
+    def get_selected_flag_label(self, idx):
+        if idx < len(self._labels):
+            return self._labels[idx]
+        else:
+            return '0'
+
+    def selected_flags(self, idx=None):
+        if idx is None:
+            return {self.get_selected_flag_key(): ','.join(self._labels)}
+        else:
+            return {
+                self.get_selected_flag_key(): self.get_selected_flag_label(idx)
+            }
+
+    @classmethod
+    def parse_device(self):
+        dev = Device()
+        visible_devices = None
+        if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.GPU
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif 'XPU_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.XPU
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.NPU
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+        elif 'MLU_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.MLU
+            visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
+
+        if visible_devices is not None and visible_devices != 'all':
+            dev._labels = visible_devices.split(',')
+        else:
+            return self.detect_device()
+
+        return dev
+
+    @classmethod
+    def detect_device(self):
+        import paddle.fluid as fluid
+
+        dev = Device()
+        num = 0
+        visible_devices = None
+        if fluid.core.is_compiled_with_cuda():
+            dev._dtype = DeviceType.GPU
+            num = fluid.core.get_cuda_device_count()
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_xpu():
+            dev._dtype = DeviceType.XPU
+            num = fluid.core.get_xpu_device_count()
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_npu():
+            dev._dtype = DeviceType.NPU
+            num = fluid.core.get_npu_device_count()
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_mlu():
+            dev._dtype = DeviceType.MLU
+            num = fluid.core.get_mlu_device_count()
+            visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
+
+        if num == 0:
+            dev._dtype = DeviceType.CPU
+        elif visible_devices is None or visible_devices == "all":
+            dev._labels = [str(x) for x in range(0, num)]
+        else:
+            dev._labels = visible_devices.split(',')
+
+        return dev
+
+
+if __name__ == '__main__':
+    d = Device.parse_device()
+    print(d.get_selected_flag())
diff --git a/python/paddle/distributed/launch/context/event.py b/python/paddle/distributed/launch/context/event.py
new file mode 100644
index 0000000000000..23e8e7a501400
--- /dev/null
+++ b/python/paddle/distributed/launch/context/event.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Event(object):
+    def __init__(self, kind="status", message="", fatal=False):
+        self.kind = kind
+        self.message = message
+        self.fatal = fatal
diff --git a/python/paddle/distributed/launch/context/node.py b/python/paddle/distributed/launch/context/node.py
new file mode 100644
index 0000000000000..1ece4db0fbbee
--- /dev/null
+++ b/python/paddle/distributed/launch/context/node.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .device import Device
+
+import socket
+import struct
+from contextlib import closing
+
+
+class Node(object):
+    def __init__(self):
+        # self.device = Device.detect_device()
+        self.device = Device.parse_device()
+        self.ip = self.get_host_ip()
+        self.free_ports = []
+
+    def get_host_ip(self):
+        try:
+            self.hostname = socket.gethostname()
+            self.ip = socket.gethostbyname(socket.getfqdn(self.hostname))
+            return self.ip
+        except:
+            return '127.0.0.1'
+
+    def get_free_ports(self, n=1):
+        free_ports = [self.get_free_port() for i in range(n)]
+        self.free_ports += free_ports
+        return free_ports
+
+    def get_ports_occupied(self):
+        return self.free_ports
+
+    @classmethod
+    def get_free_port(self):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
+                         struct.pack('ii', 1, 0))
+            s.bind(('', 0))
+            return s.getsockname()[1]
+
+    @classmethod
+    def is_server_ready(self, ip, port):
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+            #sock.settimeout(0.01)
+            #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            if hasattr(socket, 'SO_REUSEPORT'):
+                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
+            result = sock.connect_ex((ip, int(port)))
+            if result == 0:
+                return True
+            else:
+                return False
diff --git a/python/paddle/distributed/launch/context/resource.py b/python/paddle/distributed/launch/context/resource.py
new file mode 100644
index 0000000000000..faffed704c1f0
--- /dev/null
+++ b/python/paddle/distributed/launch/context/resource.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Resource(object):
+    def __init__(self):
+        self.devices = []
diff --git a/python/paddle/distributed/launch/context/status.py b/python/paddle/distributed/launch/context/status.py
new file mode 100644
index 0000000000000..cfbf3623ec22e
--- /dev/null
+++ b/python/paddle/distributed/launch/context/status.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Status(object):
+    UNINIT = "uninit"
+    READY = "ready"
+    RUNNING = "running"
+    FAILED = "failed"
+    TERMINATING = "terminating"
+    RESTARTING = "restarting"
+    UNKNOWN = "unknown"
+    COMPLETED = "completed"
+    DONE = "done"  # should exit whatever status
+
+    def __init__(self):
+        self._current_status = None
+
+    def current(self):
+        return self._current_status
+
+    def is_running(self):
+        return self._current_status == self.RUNNING
+
+    def is_restarting(self):
+        return self._current_status == self.RESTARTING
+
+    def is_done(self):
+        if self._current_status in [self.DONE, self.COMPLETED, self.FAILED]:
+            return True
+        else:
+            return False
+
+    def run(self):
+        self._current_status = self.RUNNING
+
+    def fail(self):
+        self._current_status = self.FAILED
+
+    def complete(self):
+        self._current_status = self.COMPLETED
+
+    def restart(self):
+        self._current_status = self.RESTARTING
+
+    def done(self):
+        self._current_status = self.DONE
diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py
new file mode 100644
index 0000000000000..706131300f0d8
--- /dev/null
+++ b/python/paddle/distributed/launch/controllers/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = []
+
+from .collective import CollectiveController
+from .collective import CollectiveElasticController
+from .ps import PSController
+
+# the order is extremely important
+_controllers = [
+    CollectiveElasticController,
+    PSController,
+    CollectiveController,
+]
+
+
+def init(ctx):
+    for c in _controllers:
+        if c.enable(ctx):
+            return c(ctx)
diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py
new file mode 100644
index 0000000000000..0a6c1c4002abb
--- /dev/null
+++ b/python/paddle/distributed/launch/controllers/collective.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .controller import Controller
+
+import json
+import os
+import six
+import time
+
+
+class CollectiveController(Controller):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx:
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            return True
+        else:
+            return False
+
+    def build_pod(self):
+        self.pod.replicas = self.pod_replicas()
+
+        # rank will be reset when restart
+        self.pod.rank = self.ctx.args.rank
+
+        port = self.ctx.node.get_free_port()
+
+        # compatible
+        endpoints = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(self.pod.replicas)
+        ]
+
+        data = json.dumps({
+            'name': self.pod.name,
+            'rank': self.pod.rank,
+            'replicas': self.pod.replicas,
+            'dtype': self.ctx.node.device.dtype,
+            'candidate': '{}:{}'.format(self.ctx.node.ip, port),
+            'endpoints': ",".join(endpoints),
+        })
+
+        peer_list, rank = self.master.sync_peers(
+            '/{}/info'.format(self.job.id), self.pod.name, data,
+            self.job.replicas, self.pod.rank)
+        self.pod.rank = rank
+
+        if len(peer_list) < 1:
+            return False
+
+        peer_list = [json.loads(i) for i in peer_list]
+
+        self.ctx.logger.debug("sync peers done {}".format(peer_list))
+        self.save_pod_log(peer_list)
+
+        global_size = sum([i['replicas'] for i in peer_list])
+        rank_offset = sum([i['replicas'] for i in peer_list[:rank]])
+        '''
+        The new designed collective need nothing but a master endpoint
+        '''
+        collective_master = peer_list[0]['candidate']
+
+        job_endpoints = [i['endpoints'] for i in peer_list]
+
+        self.pod.reset()
+        for i in range(self.pod.replicas):
+            e = {
+                "PADDLE_MASTER": collective_master,
+                "PADDLE_GLOBAL_SIZE": "{}".format(global_size),
+                "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas),
+                "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset),
+                "PADDLE_LOCAL_RANK": "{}".format(i),
+                ## compatible env
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints),
+                "PADDLE_CURRENT_ENDPOINT": endpoints[i],
+                "PADDLE_TRAINER_ID": "{}".format(i + rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(global_size),
+                "PADDLE_RANK_IN_NODE": str(i),
+            }
+            if self.pod.replicas == 1:
+                e.update(self.ctx.node.device.selected_flags())
+            else:
+                e.update(self.ctx.node.device.selected_flags(i))
+            self.add_container(envs=e, log_tag=i)
+
+        return True
+
+
+class CollectiveElasticController(CollectiveController):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx.args.master and ctx.args.master.startswith("etcd://"):
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            return True
+        else:
+            return False
+
+    def register(self):
+        if self.job.id == 'default':
+            self.ctx.logger.warning(
+                'Using default job name may cause conflict, add --job_id in args'
+            )
+
+        self.master.register_heartbeat(self.job.id, self.pod.name)
+
+    def run(self):
+
+        timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10
+        self.register()
+
+        while self.pod.restart <= self.ctx.args.max_restart:
+
+            self.build_job()
+
+            self.ctx.logger.info("Waiting peer ready...")
+
+            ok, replicas = self.master.wait_peer_ready(
+                self.job.replicas_min, self.job.replicas_max, timeout)
+            if ok:
+                self.job.replicas = replicas
+            else:
+                self.ctx.logger.warnning("peer not ready {}".format(self.job))
+                break
+
+            self.ctx.logger.debug("Run {}".format(self.job))
+
+            if not self.build_pod():
+                continue
+
+            self.master.set_status(self.ctx.status.RUNNING)
+
+            self.deploy_pod()
+
+            if self.watch():
+                break
+
+        self.ctx.logger.debug("Job done {}".format(self.job))
diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
new file mode 100644
index 0000000000000..08345a2a1f76b
--- /dev/null
+++ b/python/paddle/distributed/launch/controllers/controller.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import signal
+
+from paddle.distributed.launch.job.job import Job
+from paddle.distributed.launch.job.pod import Pod
+from paddle.distributed.launch.job.container import Container
+
+from .master import Master
+
+import time
+
+
+class ControleMode:
+    COLLECTIVE = "collective"
+    PS = "ps"
+
+
+class ControllerBase(object):
+    def __init__(self, ctx):
+        signal.signal(signal.SIGTERM, self.signal_handler)
+        signal.signal(signal.SIGABRT, self.signal_handler)
+        signal.signal(signal.SIGINT, self.signal_handler)
+
+        self.ctx = ctx
+        self.master = Master.factory(self.ctx)
+
+        self.job = Job(nnodes=self.ctx.args.nnodes,
+                       mode=self.ctx.args.run_mode,
+                       jid=self.ctx.args.job_id)
+        self.pod = Pod()
+
+        self.join_server = None
+
+    def deploy_pod(self):
+
+        assert len(self.pod.containers) > 0, "No container in the pod"
+
+        self.ctx.logger.info("Run {}".format(self.pod))
+        self.ctx.logger.debug(self.pod.containers[0])
+
+        self.ctx.status.run()
+        self.pod.deploy()
+
+    def run(self):
+        self.build_job()
+        self.build_pod()
+
+        self.deploy_pod()
+
+        self.watch()
+
+    def watch(self) -> bool:
+        '''
+        watch self and peer status, return true to exit
+        '''
+        #TODO(kuizhiqing) unify ctx.status and master status
+
+        self.ctx.logger.info("Watching {}".format(self.pod))
+
+        while not self.ctx.status.is_done():
+            status = self.pod.watch(timeout=2)
+
+            if self.ctx.continous_log():
+                self.pod.logs()
+
+            # completed
+            if status == self.ctx.status.COMPLETED:
+                self.ctx.status.complete()
+
+                self.master.set_status(status)
+
+                self.ctx.logger.info("Pod {}".format(status))
+                return True
+
+            # self failure
+            elif status == self.ctx.status.FAILED:
+                self.ctx.status.fail()
+
+                self.master.set_status(status)
+                self.master.restart_peer()
+
+                fc = self.pod.failed_container()
+                self.ctx.logger.info("Pod {}".format(status))
+                self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
+                fc[0].tail()
+                self.pod.stop()
+
+                if self.ctx.args.elastic_level <= 0:
+                    return True
+                else:
+                    return False
+
+            # peer failure
+            if self.ctx.status.is_restarting() and self.master.get_status(
+            ) != self.ctx.status.COMPLETED:
+                self.pod.stop()
+                return False
+
+    def stop(self, sigint=None):
+        self.ctx.logger.debug("Controller stop")
+        self.master.stop()
+        self.pod.stop(sigint)
+
+    def finalize(self):
+        self.pod.join()
+        self.master.stop()
+
+        self.ctx.logger.info("Exit code {}".format(self.pod.exit_code))
+        sys.exit(self.pod.exit_code)
+
+    def signal_handler(self, sigint, frame):
+        self.ctx.logger.info("Terminating with signal {}".format(sigint))
+
+        if hasattr(self, 'sigint'):
+            time.sleep(5)
+            sys.exit(sigint)
+
+        self.sigint = sigint
+        self.ctx.status.done()
+        self.stop(sigint)
+        time.sleep(1)
+        self.ctx.logger.debug("Exit with signal {}".format(sigint))
+        sys.exit(sigint)
+
+
+class Controller(ControllerBase):
+    '''
+    Controller API for customization
+    '''
+
+    def build_job(self):
+        '''
+        build job fill the job info.
+        '''
+        self.ctx.logger.info(self.job)
+
+    def build_pod(self) -> bool:
+        '''
+        build pod includes creating containers etc.
+
+        Return True if succeed
+        '''
+        raise NotImplementedError
+
+    def _get_entrypoint(self):
+        entrypoint = [sys.executable, "-u", self.ctx.args.training_script]
+        entrypoint.extend(self.ctx.args.training_script_args)
+        return entrypoint
+
+    def _get_out_err_file(self, out=None, err=None):
+        if out and self.ctx.args.log_dir != "":
+            out = os.path.join(self.ctx.args.log_dir, out)
+        if err and self.ctx.args.log_dir != "":
+            err = os.path.join(self.ctx.args.log_dir, err)
+        return out, (err or out)
+
+    def new_container(self,
+                      entrypoint=None,
+                      envs={},
+                      use_ctx_env=True,
+                      out=None,
+                      err=None):
+        c = Container(
+            entrypoint=(entrypoint or self._get_entrypoint()),
+            env=(self.ctx.get_envs() if use_ctx_env else {}), )
+        c.outfile, c.errfile = self._get_out_err_file(out, err)
+        c.update_env(envs)
+        return c
+
+    def add_container(self,
+                      container=None,
+                      entrypoint=None,
+                      envs={},
+                      log_tag=None,
+                      is_init=False):
+        if not is_init and log_tag is not None:
+            log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name,
+                                             log_tag)
+        else:
+            log_file = None
+
+        if not container:
+            container = self.new_container(
+                entrypoint=entrypoint, envs=envs, out=log_file, err=log_file)
+
+        if is_init:
+            self.pod.add_init_container(container)
+        else:
+            self.pod.add_container(container)
+
+    def pod_replicas(self):
+        '''
+        how many process/container should be run in pod
+        '''
+
+        if self.ctx.args.nproc_per_node:
+            return int(self.ctx.args.nproc_per_node)
+        else:
+            return self.ctx.node.device.count
+
+    def save_pod_log(self, info):
+        '''
+        save_pod_log append *info* to the log file of pod.name
+        '''
+        if not self.ctx.args.log_dir:
+            return
+
+        f = os.path.join(self.ctx.args.log_dir,
+                         '{}.{}.log'.format(self.job.id, self.pod.name))
+        try:
+            os.makedirs(os.path.dirname(f), exist_ok=True)
+            with open(f, 'a+') as fd:
+                fd.write(str(info))
+        except Exception as e:
+            self.ctx.logger.error("save log failed because {}".format(e))
diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py
new file mode 100644
index 0000000000000..43eda4cdffa24
--- /dev/null
+++ b/python/paddle/distributed/launch/controllers/master.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.distributed.launch.utils.kv_client import KVClient
+from paddle.distributed.launch.utils.kv_server import KVServer
+
+import time
+import sys
+import six
+import threading
+import copy
+import random
+
+ETCD_PROTOCAL = 'etcd://'
+
+
+class Master(object):
+    '''
+    Master is a distributed store design to exchange info among nodes
+    '''
+
+    MAIN = "main"
+    STANDBY = "standby"
+    PATICIPANT = "participant"
+
+    def __init__(self, ctx):
+        self.ctx = ctx
+        self.server = None
+        self.initialized = False
+        self.endpoint = None
+
+    def stop(self):
+        raise NotImplementedError
+
+    def set_status(self, status):
+        pass
+
+    def get_status(self):
+        return None
+
+    def restart_peer(self):
+        pass
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+        raise NotImplementedError
+
+    @classmethod
+    def factory(cls, ctx):
+        if ctx.args.master and ctx.args.master.startswith(ETCD_PROTOCAL):
+            return ETCDMaster(ctx)
+        else:
+            return HTTPMaster(ctx)
+
+
+class HTTPMaster(Master):
+    def lazy_init(self):
+        if self.initialized:
+            return
+
+        self.role = Master.PATICIPANT
+
+        if self.ctx.args.master:
+            self.endpoint = self.ctx.args.master
+            ip, port = self.endpoint.split(':')
+            if ip in ['127.0.0.1', self.ctx.node.ip]:
+                time.sleep(2 * random.random())
+                while not self.ctx.node.is_server_ready(ip, int(port)):
+                    try:
+                        self.server = KVServer(int(port))
+                        self.role = Master.MAIN
+                        break
+                    except Exception as e:
+                        self.ctx.logger.warning("start master failed {}".format(
+                            e))
+                        time.sleep(0.1)
+                        continue
+        else:
+            port = self.ctx.node.get_free_port()
+            self.endpoint = "{}:{}".format(self.ctx.node.ip, port)
+            self.server = KVServer(port)
+            self.role = Master.MAIN
+
+            print("Copy the following command to other nodes to run.")
+            cmd = [
+                sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"
+            ]
+            cmd.extend(["--master", self.endpoint])
+            cmd.extend(sys.argv[1:])
+            print("-" * 80)
+            print(" ".join(cmd))
+            print("-" * 80)
+
+            if self.ctx.args.rank >= 0:
+                self.ctx.logger.warning(
+                    "--rank set in the command may not compatible in auto mode")
+
+        if '127.0.0.1' in self.endpoint:
+            self.endpoint = self.endpoint.replace('127.0.0.1', self.ctx.node.ip)
+        self.client = KVClient(self.endpoint)
+
+        self.initialized = True
+
+        self._start_server()
+
+    def _start_server(self):
+        if self.server and not self.server.started:
+            self.server.start()
+            self.ctx.logger.debug("KV server start at {}".format(self.endpoint))
+
+    def _stop_server(self):
+        if self.server and not self.server.stopped:
+            self.server.stop()
+            self.ctx.logger.debug("KV server stopped")
+
+    def stop(self):
+        self._stop_server()
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+
+        if size < 2:
+            return [value], 0
+
+        self.ctx.logger.info("Waiting peer start...")
+
+        self.lazy_init()
+
+        while not self.ctx.status.is_done():
+            if self.client.wait_server_ready(timeout=5):
+                break
+            else:
+                self.ctx.logger.warning("master not ready")
+                time.sleep(0.1)
+
+        # 'aaaaaa' make sure main pod (master server) as rank 0
+        ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key
+        k = "{}/{}/{}".format(prefix, ky, rank)
+
+        while not self.ctx.status.is_done():
+            if not self.client.put(k, value):
+                self.ctx.logger.warning("put value failed")
+                time.sleep(0.1)
+                continue
+
+            rjson = self.client.get_prefix(prefix)
+            self.ctx.logger.debug("sync peers {}".format(rjson))
+            if rjson and len(rjson) == size:
+                if rank < 0:
+                    keys = list(rjson.keys())
+                    keys.sort()
+                    ret = [rjson[k] for k in keys]
+                    idx = ret.index(value)
+                    return ret, idx
+                else:
+                    ret = [None] * size
+                    for k, v in rjson.items():
+                        ret[int(k.split('/')[-1])] = v
+                    return ret, rank
+            else:
+                time.sleep(0.5)
+        return [], 0
+
+
+class ETCDMaster(Master):
+    def __init__(self, ctx):
+        super().__init__(ctx)
+
+        if self.ctx.args.master:
+            # etcd://localhost:2379
+            self.endpoint = self.ctx.args.master.strip("etcd://")
+
+        import etcd3
+
+        host, port = self.endpoint.split(':')
+        self.client = etcd3.client(host=host, port=port)
+
+    def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+        '''
+        sync_peers gather all value for key under scope prefix
+        result always be sorted either by rank or alphabet of pod.name
+        '''
+
+        if size < 2:
+            return [value], 0
+
+        self.ctx.logger.info("Waiting peer start...")
+
+        path = "{}/{}/{}".format(prefix, key, rank)
+
+        self.client.delete_prefix(prefix)
+
+        self.ctx.logger.debug("sync path {} value {}".format(path, value))
+
+        while not self.ctx.status.is_done():
+            self.client.put(path, six.b(value))
+
+            result = [i for i in self.client.get_prefix(prefix)]
+            result = copy.deepcopy(result)
+            self.ctx.logger.debug("sync peers {}".format(result))
+
+            if len(result) == size:
+                if rank < 0:
+                    keys = [six.ensure_str(i[1].key) for i in result]
+                    sorted_keys = [six.ensure_str(i[1].key) for i in result]
+                    sorted_keys.sort()
+                    values = [six.ensure_str(i[0]) for i in result]
+                    ret = [values[keys.index(k)] for k in sorted_keys]
+                    idx = ret.index(value)
+                    return ret, idx
+                else:
+                    ret = [None] * size
+                    for v, k in result:
+                        ii = int(six.ensure_str(k.key).split('/')[-1])
+                        if ii < 0:
+                            self.ctx.logger.error(
+                                "rank {} error in sync".format(ii))
+                        ret[ii] = six.ensure_str(v)
+                    return ret, rank
+            else:
+                time.sleep(0.5)
+
+    def register_heartbeat(self, job_id, pod_id, ttl=10):
+        if hasattr(self, 'heartbeat_prefix'):
+            self.ctx.logger.warning("Heartbeat already done")
+            return
+
+        self.job_prefix = '/paddle/{}'.format(job_id)
+        self.heartbeat_prefix = '{}/heartbeat'.format(self.job_prefix)
+
+        lease = self.client.lease(ttl)
+
+        #self.client.delete_prefix(self.job_prefix)
+
+        beat_path = "{}/{}".format(self.heartbeat_prefix, pod_id)
+        self.client.put(beat_path, six.b(pod_id), lease=lease)
+
+        def _beat_watch(event):
+            self.ctx.status.restart()
+
+        beat_watch = self.client.add_watch_prefix_callback(
+            self.heartbeat_prefix, _beat_watch)
+
+        def _heartbeat():
+            while not self.ctx.status.is_done():
+                try:
+                    lease.refresh()
+                    if pod_id not in self.fetch_peer_alive():
+                        self.client.put(beat_path, six.b(pod_id), lease=lease)
+                        self.ctx.logger.debug("Heartbeat register again")
+                except Exception as e:
+                    self.ctx.logger.error("Heartbeat error {}".format(e))
+                time.sleep(ttl / 2)
+            self.ctx.logger.debug("Heartbeat done")
+            self.client.cancel_watch(beat_watch)
+
+        self.beat_thread = threading.Thread(
+            name='heartbeat', target=_heartbeat, daemon=True)
+        self.beat_thread.start()
+
+    def fetch_peer_alive(self):
+        peer_alive = [
+            six.ensure_str(i[0])
+            for i in self.client.get_prefix(self.heartbeat_prefix)
+        ]
+        self.ctx.logger.debug("peer alive {}".format(peer_alive))
+        return peer_alive
+
+    def wait_peer_ready(self, replicas_min, replicas_max, timeout):
+        end = time.time() + timeout
+        while not self.ctx.status.is_done() and time.time() < end:
+            if len(self.fetch_peer_alive()) == replicas_max:
+                return (True, replicas_max)
+            else:
+                time.sleep(0.5)
+
+        np = len(self.fetch_peer_alive())
+        if np >= replicas_min and np <= replicas_max:
+            return (True, np)
+        else:
+            return (False, np)
+
+    def restart_peer(self):
+        self.client.delete_prefix(self.heartbeat_prefix)
+
+    def set_status(self, status):
+        assert self.client.put(
+            self.job_prefix, six.b(status),
+            lease=self.client.lease(600)), "set status failed {}".format(status)
+
+    def get_status(self):
+        return six.ensure_str(self.client.get(self.job_prefix)[0] or '')
+
+    def stop(self):
+        if hasattr(self, 'beat_thread'):
+            self.ctx.status.done()
+            # TODO(kuizhiqing) thread should exit
+            #self.beat_thread.join()
diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py
new file mode 100644
index 0000000000000..6504f1240ee09
--- /dev/null
+++ b/python/paddle/distributed/launch/controllers/ps.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .controller import Controller, ControleMode
+
+import json
+import os, shutil
+
+
+class PSController(Controller):
+    @classmethod
+    def enable(cls, ctx):
+        if ctx.args.run_mode == ControleMode.PS or ctx.args.server_num or len(
+                ctx.args.servers) > 0 or ctx.args.trainer_num or len(
+                    ctx.args.trainers) > 0:
+            ctx.logger.debug("{} enabled".format(cls.__name__))
+            ctx.args.run_mode = ControleMode.PS
+            return True
+        else:
+            return False
+
+    def build_pod(self):
+        if self.ctx.args.servers and self.ctx.args.trainers:
+            self._build_pod_with_args()
+        else:
+            self._build_pod_with_master()
+
+    def _build_pod_with_args(self):
+        if '127.0.0.1' in self.ctx.args.servers:
+            host = '127.0.0.1'
+        else:
+            host = self.ctx.node.ip
+
+        server_endpoints = [s for s in self.ctx.args.servers.split(",")]
+        trainer_endpoints = [s for s in self.ctx.args.trainers.split(",")]
+        servers = [
+            s for s in self.ctx.args.servers.split(",") if s.startswith(host)
+        ]
+        trainers = [
+            s for s in self.ctx.args.trainers.split(",") if s.startswith(host)
+        ]
+        server_num = len(servers)
+        trainer_num = len(trainers)
+
+        self.pod.replicas = server_num + trainer_num
+
+        self.save_pod_log([server_endpoints, trainer_endpoints])
+
+        import tempfile
+        gloo_rendezvous_dir = tempfile.mkdtemp()
+        if os.path.exists(gloo_rendezvous_dir):
+            shutil.rmtree(gloo_rendezvous_dir)
+
+        gloo_port = self.ctx.args.gloo_port
+        gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port)
+
+        _gloo_envs = {
+            "PADDLE_GLOO_RENDEZVOUS": "3",
+            "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir,
+            "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http,
+            "PADDLE_WITH_GLOO": self.ctx.args.with_gloo
+        }
+
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": self.ctx.args.servers,
+                "PADDLE_TRAINER_ENDPOINTS": self.ctx.args.trainers,
+                "PADDLE_PORT": servers[i].split(":")[1],
+                "PADDLE_ROLE": "PSERVER",
+                "TRAINING_ROLE": "PSERVER",
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        trainer_rank_offset = 0
+        for s in trainer_endpoints:
+            if s.startswith(host):
+                break
+            else:
+                trainer_rank_offset += 1
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT": trainers[i].split(":")[1],
+                "PADDLE_ROLE": "TRAINER",
+                "TRAINING_ROLE": "TRAINER",
+                "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+    def _build_pod_with_master(self):
+
+        self.pod.rank = self.ctx.args.rank
+
+        server_num = self.ctx.args.server_num or 1
+        servers = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(server_num)
+        ]
+        trainer_num = self.ctx.args.trainer_num or 1
+        trainers = [
+            "{}:{}".format(self.ctx.node.ip, p)
+            for p in self.ctx.node.get_free_ports(trainer_num)
+        ]
+
+        data = json.dumps({
+            'name': self.pod.name,
+            'rank': self.pod.rank,
+            'servers': servers,
+            'trainers': trainers,
+            'dtype': self.ctx.node.device.dtype,
+            'gloo_port': self.ctx.node.get_free_port(),
+        })
+
+        peer_list, rank = self.master.sync_peers(
+            '/{}/info'.format(self.job.id), self.pod.name, data,
+            self.job.replicas, self.pod.rank)
+
+        self.ctx.logger.debug("sync peers done {}".format(peer_list))
+
+        peer_list = [json.loads(i) for i in peer_list]
+
+        self.save_pod_log(peer_list)
+
+        server_endpoints = [j for i in peer_list for j in i['servers']]
+        trainer_endpoints = [j for i in peer_list for j in i['trainers']]
+        #rank_offset = sum([i['replicas'] for i in peer_list[:rank]])
+
+        server_rank_offset = sum([len(i['servers']) for i in peer_list[:rank]])
+        trainer_rank_offset = sum(
+            [len(i['trainers']) for i in peer_list[:rank]])
+
+        self.pod.rank = rank
+
+        self.pod.replicas = server_num + trainer_num
+
+        import tempfile
+        gloo_rendezvous_dir = tempfile.mkdtemp()
+        if os.path.exists(gloo_rendezvous_dir):
+            shutil.rmtree(gloo_rendezvous_dir)
+
+        gloo_port = peer_list[0]['gloo_port']
+        gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port)
+
+        _gloo_envs = {
+            "PADDLE_GLOO_RENDEZVOUS": "3",
+            "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir,
+            "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http,
+            "PADDLE_WITH_GLOO": self.ctx.args.with_gloo
+        }
+
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT":
+                server_endpoints[i + server_rank_offset].split(":")[1],
+                "PADDLE_ROLE": "PSERVER",
+                "TRAINING_ROLE": "PSERVER",
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_PORT":
+                trainer_endpoints[i + trainer_rank_offset].split(":")[1],
+                "PADDLE_ROLE": "TRAINER",
+                "TRAINING_ROLE": "TRAINER",
+                "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset),
+                "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)),
+                "POD_IP": self.ctx.node.ip,
+            }
+            e.update(_gloo_envs)
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+        ''' NEW VERSION
+        for i in range(server_num):
+            e = {
+                "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_ROLE": "PSERVER",
+                "PADDLE_RANK": "{}".format(i + server_rank_offset),
+            }
+            log_tag = "ps.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+
+        for i in range(trainer_num):
+            e = {
+                "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints),
+                "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints),
+                "PADDLE_ROLE": "TRAINER_CPU",
+                "PADDLE_RANK": "{}".format(i + trainer_rank_offset),
+            }
+            log_tag = "trainer.{}".format(i)
+            self.add_container(envs=e, log_tag=log_tag)
+        '''
diff --git a/python/paddle/distributed/launch/job/__init__.py b/python/paddle/distributed/launch/job/__init__.py
new file mode 100644
index 0000000000000..97043fd7ba688
--- /dev/null
+++ b/python/paddle/distributed/launch/job/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py
new file mode 100644
index 0000000000000..7105cae9024f2
--- /dev/null
+++ b/python/paddle/distributed/launch/job/container.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from paddle.distributed.launch.utils.process_context import ProcessContext
+
+from .status import Status
+
+import os, copy, sys
+
+
+class Container(object):
+    '''
+    TODO(kuizhiqing) A container can be run by process/thread or just a callable function
+    '''
+
+    def __init__(self, entrypoint=[], rank=-1, env={}):
+        self._entrypoint = entrypoint
+        self._rank = rank
+        self._out = None
+        self._err = None
+        self._env = env
+        self._proc = None
+
+        self._retry: int = 3
+        self._grace_period = 10
+
+        self._log_handler = None
+
+    @property
+    def entrypoint(self):
+        return self._entrypoint
+
+    @entrypoint.setter
+    def entrypoint(self, entry):
+        self._entrypoint = entry
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @rank.setter
+    def rank(self, r):
+        self._rank = r
+
+    @property
+    def outfile(self):
+        return self._out
+
+    @outfile.setter
+    def outfile(self, out):
+        self._out = out
+
+    @property
+    def errfile(self):
+        return self._err
+
+    @errfile.setter
+    def errfile(self, err):
+        self._err = err
+
+    def update_env(self, env={}, **kwargs):
+        env = {k: v for k, v in env.items() if isinstance(v, str)}
+        self._env.update(env)
+
+        kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)}
+        self._env.update(kwargs)
+
+    def _valide_env(self):
+        for k, v in self._env.items():
+            assert isinstance(k, str) and isinstance(
+                v, str), 'env {}:{} must be str'.format(k, v)
+
+    def _get_fd(self, pth):
+        if not pth:
+            return None
+
+        try:
+            d = os.path.dirname(pth)
+            if not os.path.isdir(d):
+                os.makedirs(d, exist_ok=True)
+            return open(pth, 'w')
+        except:
+            return None
+
+    def start(self):
+        if self._proc and self._proc.alive():
+            return True
+
+        self._valide_env()
+
+        self._stdout = self._get_fd(self._out) or sys.stdout
+        if self._out == self._err:
+            self._stderr = self._stdout
+        elif self._err:
+            self._stderr = self._get_fd(self._err) or sys.stderr
+
+        self._proc = ProcessContext(
+            self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
+        self._proc.start()
+
+    def terminate(self, force=False):
+        if self._log_handler:
+            self._log_handler.close()
+            self._log_handler = None
+
+        if self._proc and self._proc.alive():
+            return self._proc.terminate(force)
+
+    def wait(self, timeout=None):
+        self._proc.wait(timeout)
+
+    @property
+    def exit_code(self):
+        return self._proc.exit_code() if self._proc else -1
+
+    @property
+    def status(self):
+        if not self._proc:
+            return Status.UNINIT
+        if self._proc.alive():
+            return Status.RUNNING
+        elif self._proc.exit_code() == 0:
+            return Status.COMPLETED
+        else:
+            return Status.FAILED
+
+    def __str__(self):
+        return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format(
+            self._rank,
+            self.status,
+            self._entrypoint,
+            self.exit_code,
+            self.errfile,
+            self._env, )
+
+    def logs(self, fn=None, offset=0, whence=1, lines=1000):
+        if not self._log_handler:
+            self._log_handler = open(self._out)
+
+        if fn is None:
+            fn = sys.stdout
+
+        self._log_handler.seek(offset, whence)
+
+        try:
+            idx = 0
+            for line in self._log_handler:
+                fn.write(line)
+                idx += 1
+                if idx > lines:
+                    break
+        finally:
+            return self._log_handler.tell()
+
+    def tail(self, length=3000):
+        if not self._log_handler:
+            self._log_handler = open(self._out)
+
+        self._log_handler.seek(0, 2)
+        ed = self._log_handler.tell()
+
+        if ed > length:
+            self.logs(offset=ed - length, whence=0)
+        else:
+            self.logs(offset=0, whence=0)
diff --git a/python/paddle/distributed/launch/job/job.py b/python/paddle/distributed/launch/job/job.py
new file mode 100644
index 0000000000000..31827968ddce6
--- /dev/null
+++ b/python/paddle/distributed/launch/job/job.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class JobMode:
+    COLLECTIVE = 'collective'
+    PS = 'ps'
+    HETER = 'heter'
+
+
+class Job(object):
+    def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"):
+        self._mode = mode
+        self._id = jid
+
+        self._replicas = 0
+        self._replicas_min = self._replicas
+        self._replicas_max = self._replicas
+        self._elastic = False
+
+        self.set_replicas(str(nnodes))
+
+    def __str__(self):
+        return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format(
+            self.id, self.mode, self._replicas, self._replicas_min,
+            self._replicas_max, self.elastic)
+
+    @property
+    def mode(self):
+        return self._mode
+
+    @property
+    def id(self):
+        return self._id
+
+    @property
+    def elastic(self):
+        return self._elastic
+
+    @property
+    def replicas(self):
+        return self._replicas
+
+    @property
+    def replicas_min(self):
+        return self._replicas_min
+
+    @property
+    def replicas_max(self):
+        return self._replicas_max
+
+    @replicas.setter
+    def replicas(self, replicas):
+        self._replicas = replicas
+
+    def set_replicas(self, nnodes: str):
+        np = str(nnodes) if nnodes else '1'
+
+        if ':' in np:
+            nps = np.split(':')
+            self._replicas_min, self._replicas_max = int(nps[0]), int(nps[1])
+            self._replicas = self._replicas_max  # default to max
+
+            self._elastic = True
+        else:
+            self._replicas = int(np)
+            self._replicas_min, self._replicas_max = self._replicas, self._replicas
+
+            self._elastic = False
diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py
new file mode 100644
index 0000000000000..701adf45f94e8
--- /dev/null
+++ b/python/paddle/distributed/launch/job/pod.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from .container import Container
+
+from .status import Status
+
+import random
+import time
+
+
+class PodSepc(object):
+    def __init__(self):
+        self._name = ''.join(
+            random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6))
+
+        # by controller
+        self._init_containers: List[Container] = []
+        self._containers: List[Container] = []
+
+        #self.resource: Resource = None
+        #self.status: Status = None
+
+        self._rank = -1
+        self._init_timeout = None
+        self._restart = -1
+        self._replicas = 0  # number of containers
+        self._exit_code = 0
+
+
+class Pod(PodSepc):
+    def __init__(self):
+        super().__init__()
+
+    def __str__(self):
+        return "Pod: {}, replicas {}, status {}".format(
+            self.name, self.replicas, self.status)
+
+    def failed_container(self):
+        cs = []
+        for c in self._containers:
+            if c.status == Status.FAILED:
+                cs.append(c)
+        return cs
+
+    @property
+    def name(self):
+        return self._name
+
+    @property
+    def replicas(self):
+        return self._replicas
+
+    @replicas.setter
+    def replicas(self, r):
+        self._replicas = max(r, 1)
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @rank.setter
+    def rank(self, r):
+        self._rank = r
+
+    @property
+    def restart(self):
+        return self._restart
+
+    @property
+    def containers(self):
+        return self._containers
+
+    def add_container(self, c):
+        c.rank = len(self._containers)
+        self._containers.append(c)
+
+    @property
+    def init_containers(self):
+        return self._init_containers
+
+    def add_init_container(self, c):
+        c.rank = len(self._init_containers)
+        self._init_containers.append(c)
+
+    @property
+    def exit_code(self):
+        for c in self._containers:
+            if c.exit_code != 0:
+                return c.exit_code
+        return 0
+
+    def deploy(self):
+        # init container should stop before run containers
+        for i in self._init_containers:
+            i.start()
+            i.wait(self._init_timeout)
+
+        for c in self._containers:
+            c.start()
+
+        self._restart += 1
+
+    def stop(self, sigint=0):
+        for c in self._containers:
+            force = True if sigint == 9 else False
+            c.terminate(force)
+
+    def join(self):
+        for c in self._containers:
+            c.wait(None)
+
+    @property
+    def status(self):
+        if self.is_failed():
+            return Status.FAILED
+
+        if self.is_completed():
+            return Status.COMPLETED
+
+        if self.is_running():
+            return Status.RUNNING
+
+        return Status.READY
+
+    def reset(self):
+        self._init_containers = []
+        self._containers = []
+
+    def is_failed(self):
+        for c in self._containers:
+            if c.status == Status.FAILED:
+                return True
+        return False
+
+    def is_completed(self):
+        for c in self._containers:
+            if c.status != Status.COMPLETED:
+                return False
+        return True
+
+    def is_running(self):
+        for c in self._containers:
+            if c.status != Status.RUNNING:
+                return False
+        return True
+
+    def logs(self, idx=None):
+        if idx is None:
+            self._containers[0].logs()
+        else:
+            self._containers[idx].logs()
+
+    def tail(self, idx=None):
+        if idx is None:
+            self._containers[0].tail()
+        else:
+            self._containers[idx].tail()
+
+    def watch(self,
+              all_list=[Status.COMPLETED],
+              any_list=[Status.FAILED],
+              interval=1,
+              timeout=-1):
+        '''
+        watch return if any container status in any_list
+        or all container status in all_list
+        '''
+        end = time.time() + timeout
+        while timeout < 0 or time.time() < end:
+            for c in self._containers:
+                if c.status in any_list:
+                    return c.status
+
+            s = [c.status for c in self._containers]
+            if len(set(s)) == 1 and s[0] in all_list:
+                return s[0]
+
+            time.sleep(interval)
diff --git a/python/paddle/distributed/launch/job/status.py b/python/paddle/distributed/launch/job/status.py
new file mode 100644
index 0000000000000..ae10c5adb6cbf
--- /dev/null
+++ b/python/paddle/distributed/launch/job/status.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Status(object):
+    UNINIT = "uninit"
+    READY = "ready"
+    RUNNING = "running"
+    FAILED = "failed"
+    TERMINATING = "terminating"
+    RESTARTING = "restarting"
+    UNKNOWN = "unknown"
+    COMPLETED = "completed"
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
new file mode 100644
index 0000000000000..e6febff505e52
--- /dev/null
+++ b/python/paddle/distributed/launch/main.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .context import Context
+
+
+def launch():
+    """
+    Paddle distribution training entry ``python -m paddle.distributed.launch``.
+    
+    Usage:
+        .. code-block:: bash
+            :name: code-block-bash1
+
+            python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
+                   [--log_level LOG_LEVEL] [--nnodes NNODES]
+                   [--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
+                   [--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
+                   [--host HOST] [--servers SERVERS] [--trainers TRAINERS]
+                   [--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
+                   [--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
+                   [--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
+                   [--elastic_timeout ELASTIC_TIMEOUT]
+                   training_script ...
+
+
+    Base Parameters:
+        - ``--master``: The master/rendezvous server, support http:// and etcd://, default with http://. e.g., ``--master=127.0.0.1:8080``. Default ``--log_dir=None``.
+
+        - ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.
+
+        - ``--log_level``: The log levl to set for logging.setLevel. Default ``--log_level=INFO``.
+
+        - ``--nnodes``: The number of nodes for a distributed job, it can be a range in elastic mode, e.g., ``--nnnodes=2:3``. Default ``--nnodes=1``.
+
+        - ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less or equal to the gpus number of you system.  e.g., ``--nproc_per_node=8``
+
+        - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.
+
+        - ``--run_mode``: The run mode of job, can be:collective/ps/ps-heter. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.
+
+        - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.
+
+        - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.
+
+        - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py``
+
+        - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``
+
+    Collective Parameters:
+        - ``--ips``: [DEPRECATED] Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.
+
+    Parameter-Server Parameters:
+        - ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``
+
+        - ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``
+
+        - ``--workers``: [DEPRECATED] The same as trainers.
+
+        - ``--trainer_num``: Number of trainers on each node, can be 0.
+
+        - ``--worker_num``: [DEPRECATED] The same as trainer_num.
+
+        - ``--server_num``: Number of servers on each node, can be 0.
+
+        - ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
+
+        - ``--heter_worker_num``: Number of heter_workers in each stage (It recommend to set when in the emulated distributed environment using single node)
+        
+        - ``--heter_devices``: Type of heter_device in each stage
+
+        - ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.
+
+        - ``--with_gloo``: Using gloo or not. Default ``--with_gloo=0``.
+
+    Elastic Parameters:
+        - ``--max_restart``: The maximum restart times for an elastic job. Default ``--max_restart=3``.
+
+        - ``--elastic_level``: The elastic level: -1: disable, 0: failed exit, peers hold, 1: internal restart. Default ``--elastic_level=-1``.
+
+        - ``--elastic_timeout``: Seconds to wait before elastic job begin to train. Default ``--elastic_timeout=30``.
+
+
+    Returns:
+        ``None``
+
+    Examples 0 (master, ip/port auto detection):
+
+            # For training on multi node, run the following command in one of the nodes
+
+            python -m paddle.distributed.launch --nnodes 2 train.py
+
+            # Then the following info will be print
+
+            # Copy the following command to other nodes to run.
+            # --------------------------------------------------------------------------------
+            # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
+            # --------------------------------------------------------------------------------
+
+            # Follow the instruction above and paste the command in other nodes can launch a multi nodes training job.
+
+            # There are two ways to launch a job with the same command for multi nodes training
+            # 1) using the following command in every nodes, make sure the ip is one of the training node and the port is available on that node
+            # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
+            # 2) using the following command in every nodes with a independent etcd service
+            # python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py
+
+            # This functionality works will for both collective and ps mode and even with other arguments.
+
+
+    Examples 1 (collective, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash1
+            
+            # For training on single node using 4 gpus.
+
+            python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
+        
+    Examples 2 (collective, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash2
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 
+
+            # On 192.168.0.16:
+
+            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
+
+            # On 192.168.0.17:
+            python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
+        
+    Examples 3 (ps, cpu, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash3
+
+            # To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
+            
+            python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
+        
+    Examples 4 (ps, cpu, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash4
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers.
+
+            # On 192.168.0.16:
+
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+            # On 192.168.0.17:
+
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+            # Or with master, the following command run 2 server and 2 trainer on each node.
+
+            python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py
+
+
+    Examples 5 (ps, gpu, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash5
+
+           # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
+            
+            export CUDA_VISIBLE_DEVICES=0,1,2,3
+            python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
+            
+    Examples 6 (ps, gpu, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash6
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers.
+
+            # On 192.168.0.16:
+
+            export CUDA_VISIBLE_DEVICES=0,1
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+            # On 192.168.0.17:
+
+            export CUDA_VISIBLE_DEVICES=0,1
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
+
+    Examples 7 (ps-heter, cpu + gpu, single node):
+        .. code-block:: bash
+            :name: code-block-example-bash7
+
+            # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
+            
+            export CUDA_VISIBLE_DEVICES=0,1
+            python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
+            
+    Examples 8 (ps-heter, cpu + gpu, multi node):
+        .. code-block:: bash
+            :name: code-block-example-bash8
+
+            # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server, 1 gpu worker, 1 cpu worker.
+
+            # On 192.168.0.16:
+
+            export CUDA_VISIBLE_DEVICES=0
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
+
+            # On 192.168.0.17:
+
+            export CUDA_VISIBLE_DEVICES=0
+            python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
+
+    Examples 9 (elastic):
+        .. code-block:: bash
+            :name: code-block-example-bash9
+
+            # With the following command, the job will begin to run immediately if 4 nodes are ready,
+            # or it will run after elastic_timeout if only 2 or 3 nodes ready
+            python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py
+            
+            # once the number of nodes changes between 2:4 during training, the strategy holds
+        
+    """
+
+    # initialize the context to run
+    ctx = Context()
+
+    if ctx.is_legacy_mode():
+
+        # legacy mode
+        from paddle.distributed.fleet import launch
+        launch.launch()
+
+    else:
+
+        from . import controllers
+
+        # initialize the selected controller
+        c = controllers.init(ctx)
+
+        # run the pods
+        c.run()
+
+        # manager or just wait pod
+        c.finalize()
+
+
+if __name__ == "__main__":
+    launch()
diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py
new file mode 100644
index 0000000000000..1862f75a77f65
--- /dev/null
+++ b/python/paddle/distributed/launch/plugins/__init__.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+
+__all__ = []
+
+
+def log(ctx):
+    ctx.logger.info("-----------  Configuration  ----------------------")
+    for arg, value in sorted(six.iteritems(vars(ctx.args))):
+        ctx.logger.info("%s: %s" % (arg, value))
+    ctx.logger.info("--------------------------------------------------")
+
+
+def process_args(ctx):
+    # reset device by args
+    #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
+    argdev = ctx.args.devices
+    if argdev:
+        ctx.node.device.labels = argdev.split(',')
+        ctx.logger.debug('Device reset by args {}'.format(argdev))
+
+
+def collective_compatible(ctx):
+    if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
+        eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')
+        hosts = set([h.split(':')[0] for h in eps])
+        ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
+        ctx.args.nnodes = len(hosts)
+        ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
+            eps))
+    '''
+    if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
+        eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
+        hosts = set([h.split(':')[0] for h in eps])
+        ctx.args.master = eps[0]
+        ctx.args.nnodes = len(hosts)
+        ctx.logger.info(
+            'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps))
+    '''
+
+
+def rewrite_host_ip(ctx):
+    if ctx.args.host is not None and "." in ctx.args.host:
+        ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host))
+        ctx.node.ip = ctx.args.host
+
+
+enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log]
diff --git a/python/paddle/distributed/launch/utils/__init__.py b/python/paddle/distributed/launch/utils/__init__.py
new file mode 100644
index 0000000000000..97043fd7ba688
--- /dev/null
+++ b/python/paddle/distributed/launch/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/launch/utils/kv_client.py b/python/paddle/distributed/launch/utils/kv_client.py
new file mode 100644
index 0000000000000..e19195412268a
--- /dev/null
+++ b/python/paddle/distributed/launch/utils/kv_client.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import time
+
+
+class KVClient(object):
+    def __init__(self, endpoint='localhost:2379'):
+        self.endpoint = endpoint if endpoint.startswith(
+            "http://") else "http://{}".format(endpoint)
+
+    def put(self, key, value):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.post(u, data=value, timeout=3)
+            if r.status_code == 200:
+                return True
+            else:
+                return False
+        except:
+            return False
+
+    def get(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.get(u, timeout=3)
+            if r.status_code == 200:
+                ret = r.json()
+                return ret.get(key, '')
+            else:
+                return "error"
+        except:
+            return ""
+
+    def get_prefix(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.get(u, timeout=3)
+            if r.status_code == 200:
+                return r.json()
+        except:
+            return ""
+
+    def delete(self, key):
+        key = key if key.startswith('/') else "/{}".format(key)
+        u = "{}{}".format(self.endpoint, key)
+        try:
+            r = requests.delete(u, timeout=3)
+            if r.status_code == 200:
+                return True
+            else:
+                return False
+        except:
+            return False
+
+    def wait_server_ready(self, timeout=3):
+        end = time.time() + timeout
+        while time.time() < end:
+            if self.get("/healthy") == "ok":
+                return True
+
+
+if __name__ == '__main__':
+    cli = PKVClient("http://localhost:8090")
+    data = {"/workers/1": "rank1", "/workers/2": "rank2"}
+    for k, v in data.items():
+        cli.put(k, v)
+    x = cli.get_prefix("/workers")
+    print(x)
+    for k, v in data.items():
+        assert x[k] == v
+
+    cli.put("key", "value")
+    print(cli.get("key"))
+    assert cli.get("key") == "value"
+    cli.delete("key")
+    print(cli.get("/key"))
+    print(cli.get("/healthy"))
+    assert cli.get("/healthy") == "ok"
diff --git a/python/paddle/distributed/launch/utils/kv_server.py b/python/paddle/distributed/launch/utils/kv_server.py
new file mode 100644
index 0000000000000..2d7ae15f13d63
--- /dev/null
+++ b/python/paddle/distributed/launch/utils/kv_server.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from http.server import HTTPServer
+import http.server as SimpleHTTPServer
+
+from multiprocessing import Process
+
+import threading
+import json
+
+
+class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    def do_GET(self):
+        with self.server.kv_lock:
+            ret = {}
+            for k, v in self.server.kv.items():
+                if k.startswith(self.path):
+                    ret[k] = v.decode(encoding="utf-8")
+            if ret:
+                self.output(200, json.dumps(ret).encode("utf-8"))
+            else:
+                self.output(404)
+
+    def do_PUT(self):
+        self.do_POST()
+
+    def do_POST(self):
+        content_length = int(self.headers['Content-Length'] or 0)
+        try:
+            value = self.rfile.read(content_length)
+            with self.server.kv_lock:
+                self.server.kv[self.path] = value
+                self.output(200)
+                return
+        except:
+            self.output(500)
+
+    def do_DELETE(self):
+        with self.server.kv_lock:
+            if self.path in self.server.kv:
+                del self.server.kv[self.path]
+                self.output(200)
+            else:
+                self.output(404)
+
+    def output(self, code, value=''):
+        self.send_response(code)
+        self.send_header("Content-Length", len(value))
+        self.send_header("Content-Type", "application/json; charset=utf8")
+        self.end_headers()
+        if value:
+            self.wfile.write(value)
+
+    def log_message(self, format, *args):
+        return
+
+
+class KVServer(HTTPServer, object):
+    def __init__(self, port):
+        super(KVServer, self).__init__(('', port), KVHandler)
+        self.kv_lock = threading.Lock()
+        self.kv = {'/healthy': b'ok'}
+        self.port = port
+        self.stopped = False
+        self.started = False
+
+    def start(self):
+        self.listen_thread = threading.Thread(target=self.serve_forever)
+        self.listen_thread.start()
+        self.started = True
+
+    def stop(self):
+        self.shutdown()
+        self.listen_thread.join()
+        self.server_close()
+        self.stopped = True
+
+
+class PKVServer():
+    def __init__(self, port):
+        self._server = KVServer(port)
+
+    def start(self):
+        self.proc = Process(target=self._server.start)
+        self.proc.daemon = True
+        self.proc.start()
+
+    def stop(self):
+        self._server.stop()
+        self.proc.join()
+
+    @property
+    def started(self):
+        return self._server.started
+
+    @property
+    def stopped(self):
+        return self._server.stopped
+
+
+if __name__ == '__main__':
+    #kv = PKVServer(8090)
+    kv = KVServer(8090)
+    kv.start()
+    import time
+
+    #print("serve at 8090 for 600 s")
+
+    time.sleep(600)
diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py
new file mode 100644
index 0000000000000..4d6fa8de794ff
--- /dev/null
+++ b/python/paddle/distributed/launch/utils/process_context.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import os, sys, signal, time
+
+
+class ProcessContext(object):
+    def __init__(self,
+                 cmd,
+                 env=os.environ,
+                 out=sys.stdout,
+                 err=sys.stderr,
+                 group=True,
+                 preexec_fn=None):
+        self._cmd = cmd
+        self._env = env
+        self._preexec_fn = preexec_fn
+        self._stdout = out
+        self._stderr = err
+        self._group = group if os.name != 'nt' else False
+        self._proc = None
+        self._code = None
+
+    def _start(self):
+        pre_fn = os.setsid if self._group else None
+        self._proc = subprocess.Popen(
+            self._cmd,
+            env=self._env,
+            stdout=self._stdout,
+            stderr=self._stderr,
+            preexec_fn=self._preexec_fn or pre_fn)
+
+    def _close_std(self):
+        try:
+            if not self._stdout.isatty():
+                self._stdout.close()
+
+            if not self._stderr.isatty():
+                self._stderr.close()
+        except:
+            pass
+
+    def alive(self):
+        return self._proc and self._proc.poll() is None
+
+    def exit_code(self):
+        return self._proc.poll() if self._proc else None
+
+    def start(self):
+        self._start()
+
+    def terminate(self, force=False, max_retry=3):
+        for i in range(max_retry):
+            if self.alive():
+                if self._group:
+                    os.killpg(os.getpgid(self._proc.pid), signal.SIGTERM)
+                else:
+                    self._proc.terminate()
+                time.sleep(0.2)
+            else:
+                break
+
+        if force and self.alive():
+            self._proc.kill()
+
+        self._close_std()
+
+        return self.alive()
+
+    def wait(self, timeout=None):
+        self._proc.wait(timeout)
diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py
new file mode 100644
index 0000000000000..e1663029ef1f8
--- /dev/null
+++ b/python/paddle/distributed/models/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py
new file mode 100644
index 0000000000000..e1663029ef1f8
--- /dev/null
+++ b/python/paddle/distributed/models/moe/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py
new file mode 100644
index 0000000000000..fd98c64318c60
--- /dev/null
+++ b/python/paddle/distributed/models/moe/utils.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid import core
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import in_dygraph_mode
+
+
+def _number_count(gate_idx, upper_range):
+    """
+    calculate the expert count according to the gate index.
+    Args:
+        gate_idx (Tensor): Tensor. The input gate index whose data type should be int32 or int64.
+        upper_range (int): The number of the experts.
+    Returns:
+        out (Tensor): The output expert count.
+    Examples:
+        .. code-block:: python
+            # required: distributed
+            import paddle
+
+            gate_idx = [
+                [0, 2],
+                [0, 2]
+            ]
+            upper_range = 6
+            gate_idx = paddle.to_tensor(gate_idx, dtype="int32")
+            number_count = paddle.distributed.utils.number_count(gate_idx, upper_range)
+            print(number_count) # the result: [2, 0, 2, 0, 0, 0]
+    """
+    if in_dygraph_mode():
+        return core.ops.number_count(gate_idx, 'upper_range', upper_range)
+    else:
+        op_type = 'number_count'
+
+        helper = LayerHelper(op_type, **locals())
+        out = helper.create_variable_for_type_inference(dtype=gate_idx.dtype)
+
+        helper.append_op(
+            type=op_type,
+            inputs={'gate_idx': gate_idx},
+            outputs={'Out': out},
+            attrs={'upper_range': upper_range})
+        return out
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 7480909a2d88d..fb9e8d8ece100 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -228,3 +228,5 @@ def remove_flag_if_exists(name):
 atexit.register(core.clear_executor_cache)
 # NOTE(Aganlengzi): clean up KernelFactory in advance manually.
 atexit.register(core.clear_kernel_factory)
+# NOTE(wangran16): clean up DeviceManger in advance manually.
+atexit.register(core.clear_device_manager)
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index b8a696057e780..d21b7e4740a6e 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -542,7 +542,7 @@ def __init__(self):
     def set_graph_config(self,
                          num_ipus=1,
                          is_training=True,
-                         batch_size=1,
+                         micro_batch_size=1,
                          enable_manual_shard=False):
         """
         Set graph configuration to the IpuStrategy instance.
@@ -571,7 +571,7 @@ def set_graph_config(self,
                 ipu_strategy = static.IpuStrategy()
                 ipu_strategy.set_graph_config(num_ipus=1,
                                             is_training=True,
-                                            batch_size=1,
+                                            micro_batch_size=1,
                                             enable_manual_shard=False)
         """
         if num_ipus == 1 and enable_manual_shard:
@@ -581,7 +581,7 @@ def set_graph_config(self,
         options = {
             'num_ipus': num_ipus,
             'is_training': is_training,
-            'micro_batch_size': batch_size,
+            'micro_batch_size': micro_batch_size,
             'enable_manual_shard': enable_manual_shard,
         }
         self.set_options(options)
@@ -589,6 +589,7 @@ def set_graph_config(self,
     def set_pipelining_config(self,
                               enable_pipelining=False,
                               batches_per_step=1,
+                              enable_gradient_accumulation=False,
                               accumulation_factor=1):
         """
         Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance.
@@ -598,6 +599,8 @@ def set_pipelining_config(self,
                 Default False, which means disabled.
             batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1.
                 Default 1, which means no data pipelining.
+            enable_gradient_accumulation (bool, optional): Enable to accumulate gradients before updating the weights in training mode. Only if enable_pipelining=True,
+                enable_gradient_accumulation is able to be set True. Default False, which means no gradient accumulation. 
             accumulation_factor (int, optional): Specify the number of micro-batches to accumulate 
                 before applying the varUpdate. Default 1, which means disable the accumulation.
         
@@ -617,6 +620,7 @@ def set_pipelining_config(self,
                 ipu_strategy = static.IpuStrategy()
                 ipu_strategy.set_pipelining_config(enable_pipelining=False,
                                                     batches_per_step=1,
+                                                    enable_gradient_accumulation=False,
                                                     accumulation_factor=1)
         """
         enable_manual_shard = self.get_option('enable_manual_shard')
@@ -627,6 +631,7 @@ def set_pipelining_config(self,
         options = {
             'enable_pipelining': enable_pipelining,
             'batches_per_step': batches_per_step,
+            'enable_gradient_accumulation': enable_gradient_accumulation,
             'accumulation_factor': accumulation_factor,
         }
         self.set_options(options)
@@ -754,6 +759,56 @@ def get_option(self, option):
         """
         return self._ipu_strategy.get_option(option)['value']
 
+    def enable_pattern(self, pattern):
+        """
+        Enable PopART pattern to optimize the graph.
+
+        Args:
+            pattern(string): the name of the pattern.
+        
+        Returns:
+            None.
+
+        Examples:
+            .. code-block:: python
+
+                # required: ipu
+
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                ipu_strategy = static.IpuStrategy()
+                ipu_strategy.enable_pattern("ViewSimplifyPattern")
+        """
+        self._ipu_strategy.enable_pattern(pattern)
+
+    def disable_pattern(self, pattern):
+        """
+        Disable PopART pattern.
+
+        Args:
+            pattern(string): the name of the pattern.
+        
+        Returns:
+            None.
+
+        Examples:
+            .. code-block:: python
+
+                # required: ipu
+
+                import paddle
+                import paddle.static as static
+
+                paddle.enable_static()
+
+                ipu_strategy = static.IpuStrategy()
+                ipu_strategy.disable_pattern("ViewSimplifyPattern")
+        """
+        self._ipu_strategy.disable_pattern(pattern)
+
     @property
     def num_ipus(self):
         """
@@ -817,8 +872,8 @@ class IpuCompiledProgram(object):
             main_prog = static.default_main_program()
             
             ipu_strategy = static.IpuStrategy()
-            ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1)
-            ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1)
+            ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
+            ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
             ipu_strategy.set_precision_config(enable_fp16=False)
             
             ipu_compiled_program = static.IpuCompiledProgram(
@@ -891,8 +946,8 @@ def compile(self, feed_list, fetch_list):
                 main_prog = static.default_main_program()
 
                 ipu_strategy = static.IpuStrategy()
-                ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1)
-                ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1)
+                ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
+                ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
                 ipu_strategy.set_precision_config(enable_fp16=False)
                 
                 program = static.IpuCompiledProgram(
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index 80d2ccb0d5ca6..9dba5d658dfc9 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -173,6 +173,9 @@ def _update_list(self):
 elif core.is_compiled_with_npu():
     _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
         'NPU', core.VarDesc.VarType.FP16)
+elif core.is_compiled_with_mlu():
+    _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
+        'MLU', core.VarDesc.VarType.FP16)
 else:
     _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
         'GPU', core.VarDesc.VarType.FP16)
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 97b4116826a2a..d614630b3db12 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -979,8 +979,6 @@ def analysis_and_save_info(op_node, out_var_name):
                 if op.type in (
                         self._quantizable_op_type + self._out_scale_op_list):
                     out_var_names = _get_op_output_var_names(op)
-                    assert len(out_var_names) == 1, "Post training " + \
-                        "quantization only support one output for " + op.type
                     for var_name in out_var_names:
                         analysis_and_save_info(op, var_name)
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index efa000274d01a..afca617b6dd82 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -59,6 +59,7 @@
     "tanh",
     "prelu",
     "swish",
+    "dropout",
     "softmax",
     "batch_norm",
     "layer_norm",
@@ -68,6 +69,8 @@
     "transpose2",
     "concat",
     "elementwise_mul",
+    "elementwise_pow",
+    "elementwise_sub",
     "scale",
     "slice",
     "hard_swish",
@@ -81,8 +84,54 @@
     "flatten2",
     "transpose",
     "pad2d",
+    "pad3d",
     "reshape",
-    "layer_norm",
+    "split",
+    "flatten_contiguous_range",
+    "squeeze",
+    "squeeze2",
+    "nearest_interp_v2",
+    "fill_constant_batch_size_like",
+    "bilinear_interp",
+    "bilinear_interp_v2",
+    "arg_max",
+    "abs",
+    "assign",
+    "cast",
+    "clip",
+    "box_coder",
+    "crop",
+    "cumsum",
+    "equal",
+    "expand_v2",
+    "fill_any_like",
+    "fill_constant",
+    "gelu",
+    "instance_norm",
+    "lookup_table",
+    "lookup_table_v2",
+    "norm",
+    "p_norm",
+    "pow",
+    "reduce_mean",
+    "stack",
+    "top_k_v2",
+    "unsqueeze",
+    "unsqueeze2",
+    "logical_and",
+    "logical_not",
+    "meshgrid",
+    "roi_align",
+    "strided_slice",
+    "where",
+    "grid_sampler",
+    "tile",
+    "group_norm",
+    "reduce_sum",
+    "square",
+    "softplus",
+    "gather",
+    "shuffle_channel",
 ]
 
 # list op real input and output names, to avoid processing input such as AxisTensor.
@@ -119,7 +168,7 @@
     "relu": [["X"], ["Out"]],
     "relu6": [["X"], ["Out"]],
     "leaky_relu": [["X"], ["Out"]],
-    "prelu": [["X"], ["Out"]],
+    "prelu": [["X", "Alpha"], ["Out"]],
     "tanh": [["X"], ["Out"]],
     "swish": [["X"], ["Out"]],
     "dropout": [["X"], ["Out"]],
@@ -127,16 +176,59 @@
     "layer_norm": [["X"], ["Y"]],
     "sigmoid": [["X"], ["Out"]],
     "elementwise_mul": [["X", "Y"], ["Out"]],
+    "elementwise_pow": [["X", "Y"], ["Out"]],
     "scale": [["X"], ["Out"]],
     "hard_swish": [["X"], ["Out"]],
     "hard_sigmoid": [["X"], ["Out"]],
     "gru": [["Input", "Weight"], ["Hidden"]],
     "lstm": [["Input", "Weight"], ["Hidden"]],
     "pad2d": [["X"], ["Out"]],
+    "pad3d": [["X"], ["Out"]],
     "flatten": [["X"], ["Out"]],
     "flatten2": [["X"], ["Out"]],
     "unsqueeze2": [["X"], ["Out"]],
-    "flatten_contiguous_range": [['X'], ["Out"]],
+    "unsqueeze2": [["X"], ["Out"]],
+    "flatten_contiguous_range": [["X"], ["Out"]],
+    "split": [["X"], ["Out"]],
+    "squeeze2": [["X"], ["Out"]],
+    "nearest_interp_v2": [["X"], ["Out"]],
+    "bilinear_interp": [["X"], ["Out"]],
+    "bilinear_interp_v2": [["X"], ["Out"]],
+    "fill_constant_batch_size_like": [["Input"], ["Out"]],
+    "arg_max": [["X"], ["Out"]],
+    "abs": [["X"], ["Out"]],
+    "assign": [["X"], ["Out"]],
+    "cast": [["X"], ["Out"]],
+    "clip": [["X"], ["Out"]],
+    "box_coder": [["PriorBox"], ["OutputBox"]],
+    "crop": [["X"], ["Out"]],
+    "cumsum": [["X"], ["Out"]],
+    "expand_v2": [["X"], ["Out"]],
+    "fill_any_like": [["X"], ["Out"]],
+    "fill_constant": [[], ["Out"]],
+    "gelu": [["X"], ["Out"]],
+    "instance_norm": [["X"], ["Out"]],
+    "lookup_table": [["W", "Ids"], ["Out"]],
+    "lookup_table_v2": [["W", "Ids"], ["Out"]],
+    "norm": [["X"], ["Norm"]],
+    "p_norm": [["X"], ["Out"]],
+    "pow": [["X"], ["Out"]],
+    "reduce_mean": [["X"], ["Out"]],
+    "stack": [["X"], ["Y"]],
+    "top_k_v2": [["X"], ["Out", "Indices"]],
+    "logical_and": [["X", "Y"], ["Out"]],
+    "logical_not": [["X"], ["Out"]],
+    "meshgrid": [["X"], ["Out"]],
+    "roi_align": [["X", "ROIs"], ["Out"]],
+    "strided_slice": [["Input"], ["Out"]],
+    "where": [["Condition", "X", "Y"], ["Out"]],
+    "grid_sampler": [["X", "Grid"], ["Output"]],
+    "tile": [["X"], ["Out"]],
+    "group_norm": [["X"], ["Y", "Mean", "Variance"]],
+    "reduce_sum": [["X"], ["Out"]],
+    "square": [["X"], ["Out"]],
+    "softplus": [["X"], ["Out"]],
+    "shuffle_channel": [["X"], ["Out"]],
 }
 
 _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
@@ -1797,14 +1889,93 @@ class AddQuantDequantPass(object):
     quantized ops's inputs.
     """
     _supported_quantizable_op_type = [
-        "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose",
-        "equal", "gather", "greater_equal", "greater_than", "less_equal",
-        "less_than", "mean", "not_equal", "reshape", "reshape2",
-        "bilinear_interp", "nearest_interp", "trilinear_interp", "slice",
-        "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6",
-        "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2",
-        "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm",
-        "matmul_v2"
+        "pool2d",
+        "elementwise_add",
+        "concat",
+        "softmax",
+        "argmax",
+        "transpose",
+        "equal",
+        "gather",
+        "greater_equal",
+        "greater_than",
+        "less_equal",
+        "less_than",
+        "mean",
+        "not_equal",
+        "reshape",
+        "reshape2",
+        "dropout",
+        "bilinear_interp",
+        "nearest_interp",
+        "trilinear_interp",
+        "slice",
+        "squeeze",
+        "elementwise_sub",
+        "mul",
+        "matmul",
+        "relu",
+        "relu6",
+        "leaky_relu",
+        "tanh",
+        "swish",
+        "scale",
+        "transpose",
+        "transpose2",
+        "sigmoid",
+        "pad2d",
+        "flatten",
+        "flatten2",
+        "batch_norm",
+        "layer_norm",
+        "matmul_v2",
+        "split",
+        "flatten_contiguous_range",
+        "squeeze2",
+        "nearest_interp_v2",
+        "bilinear_interp",
+        "bilinear_interp_v2",
+        "fill_constant_batch_size_like",
+        "arg_max",
+        "abs",
+        "assign",
+        "cast",
+        "clip",
+        "box_coder",
+        "crop",
+        "cumsum",
+        "elementwise_mul",
+        "elementwise_pow",
+        "expand_v2",
+        "fill_any_like",
+        "fill_constant",
+        "gelu",
+        "hard_sigmoid",
+        "hard_swish",
+        "instance_norm",
+        "lookup_table",
+        "lookup_table_v2",
+        "norm",
+        "p_norm",
+        "pad3d",
+        "pow",
+        "prelu",
+        "reduce_mean",
+        "unsqueeze",
+        "unsqueeze2",
+        "logical_and",
+        "logical_not",
+        "meshgrid",
+        "roi_align",
+        "strided_slice",
+        "where",
+        "grid_sampler",
+        "tile",
+        "group_norm",
+        "reduce_sum",
+        "square",
+        "softplus",
+        "shuffle_channel",
     ]
 
     # To be compatible with PaddleSlim, not remove _activation_type for now
diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index 3fadf25150f9e..f97c2778c0918 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -52,6 +52,30 @@ def parse_args():
         '--debug',
         action='store_true',
         help='If used, the graph of Quant model is drawn.')
+    parser.add_argument(
+        '--quant_model_filename',
+        type=str,
+        default="",
+        help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.'
+    )
+    parser.add_argument(
+        '--quant_params_filename',
+        type=str,
+        default="",
+        help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.'
+    )
+    parser.add_argument(
+        '--save_model_filename',
+        type=str,
+        default="__model__",
+        help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.'
+    )
+    parser.add_argument(
+        '--save_params_filename',
+        type=str,
+        default=None,
+        help='The name of file to save all related parameters. If it is set None, parameters will be saved in separate files'
+    )
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
@@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path,
                                   save_path,
                                   ops_to_quantize='',
                                   op_ids_to_skip='',
-                                  debug=False):
+                                  debug=False,
+                                  quant_model_filename='',
+                                  quant_params_filename='',
+                                  save_model_filename='',
+                                  save_params_filename=''):
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     inference_scope = fluid.executor.global_scope()
     with fluid.scope_guard(inference_scope):
-        if os.path.exists(os.path.join(original_path, '__model__')):
-            [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(original_path, exe)
+        if not quant_model_filename:
+            if os.path.exists(os.path.join(original_path, '__model__')):
+                [inference_program, feed_target_names,
+                 fetch_targets] = fluid.io.load_inference_model(original_path,
+                                                                exe)
+            else:
+                [inference_program, feed_target_names,
+                 fetch_targets] = fluid.io.load_inference_model(
+                     original_path, exe, 'model', 'params')
         else:
             [inference_program, feed_target_names,
-             fetch_targets] = fluid.io.load_inference_model(original_path, exe,
-                                                            'model', 'params')
+             fetch_targets] = fluid.io.load_inference_model(
+                 original_path, exe, quant_model_filename,
+                 quant_params_filename)
 
         ops_to_quantize_set = set()
         print(ops_to_quantize)
@@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path,
         graph = transform_to_mkldnn_int8_pass.apply(graph)
         inference_program = graph.to_program()
         with fluid.scope_guard(inference_scope):
-            fluid.io.save_inference_model(save_path, feed_target_names,
-                                          fetch_targets, exe, inference_program)
+            fluid.io.save_inference_model(
+                save_path,
+                feed_target_names,
+                fetch_targets,
+                exe,
+                inference_program,
+                model_filename=save_model_filename,
+                params_filename=save_params_filename)
         print(
             "Success! INT8 model obtained from the Quant model can be found at {}\n"
             .format(save_path))
@@ -109,4 +150,6 @@ def transform_and_save_int8_model(original_path,
     test_args, remaining_args = parse_args()
     transform_and_save_int8_model(
         test_args.quant_model_path, test_args.int8_model_save_path,
-        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug)
+        test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug,
+        test_args.quant_model_filename, test_args.quant_params_filename,
+        test_args.save_model_filename, test_args.save_params_filename)
diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py
index 9bf45f4272738..ec288a1287119 100644
--- a/python/paddle/fluid/contrib/sparsity/__init__.py
+++ b/python/paddle/fluid/contrib/sparsity/__init__.py
@@ -29,10 +29,11 @@
 from .asp import prune_model
 from .asp import set_excluded_layers
 from .asp import reset_excluded_layers
+from .supported_layer_list import add_supported_layer
 
 __all__ = [
     'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
     'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity',
     'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers',
-    'reset_excluded_layers'
+    'reset_excluded_layers', 'add_supported_layer'
 ]
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index ffa12ac704600..30439ad736d26 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -23,6 +23,8 @@
 from paddle.fluid import global_scope, program_guard, layers
 from paddle.fluid.initializer import ConstantInitializer
 from paddle.fluid.contrib import sparsity
+from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
+from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning
 from paddle.fluid import core
 
 OpRole = core.op_proto_and_checker_maker.OpRole
@@ -292,8 +294,8 @@ class ASPHelper(object):
     2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning.
     """
 
-    MASK_APPENDDED_NAME = '_asp_mask'
-    SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'}
+    MASK_APPENDDED_NAME = 'asp_mask'
+    PADDLE_WEIGHT_SUFFIX = "w_"
 
     __asp_info = {}
 
@@ -334,7 +336,6 @@ def prune_model(cls,
         r"""
         This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`.
         """
-        checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo)
 
         if main_program is None:
             main_program = paddle.static.default_main_program()
@@ -345,33 +346,27 @@ def prune_model(cls,
                 weight_tensor = global_scope().find_var(param.name).get_tensor()
                 weight_nparray = np.array(weight_tensor)
 
-                # The double transpose ops here make sure pruning direction consistent with cuSparseLt.
-                # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix.
-                # cuSparseLt would prune matrix A along k dimension.
-                # In sparse training, layer weight matriices is viewed sparse matrix A, so
-                # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle
-                #  is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed 
-                # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension 
-                # of W^T, which is m dimension of W. Moreove, all mask generating functions in 
-                # sparsity/utils is row-major pruning. That is the reason we have to transpose weight 
-                # matrices beforce invoking create_mask. Then we transpose the result maks to make 
-                # sure its shape to be the same as the input weight.
-                weight_sparse_mask = sparsity.create_mask(
-                    weight_nparray.T, func_name=mask_algo, n=n, m=m).T
-                weight_pruned_nparray = np.multiply(weight_nparray,
-                                                    weight_sparse_mask)
+                prune_func = ASPHelper._get_prune_func_by_name(param.name)
+
+                weight_pruned_nparray, weight_sparse_mask = \
+                    prune_func(weight_nparray, m, n, mask_algo, param.name)
+                weight_pruned_nparray = weight_pruned_nparray.astype(
+                    weight_nparray.dtype)
                 weight_tensor.set(weight_pruned_nparray, place)
-                assert sparsity.check_sparsity(weight_pruned_nparray.T,  n=n, m=m, func_name=checked_func_name), \
-                        'Pruning {} weight matrix failure!!!'.format(param.name)
+
                 if with_mask:
                     weight_mask_param = global_scope().find_var(
                         ASPHelper._get_mask_name(param.name))
                     assert weight_mask_param is not None, \
-                        'Cannot find {} variable, please call ASPHelper.minimize' \
+                        'Cannot find {} variable, please call optimizer.minimize (' \
+                        'paddle.sparsity.decorate(optimizer).minimize(loss)' \
                         ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name))
                     weight_mask_tensor = weight_mask_param.get_tensor()
+                    weight_sparse_mask = weight_sparse_mask.astype(
+                        np.array(weight_mask_tensor).dtype)
                     weight_mask_tensor.set(weight_sparse_mask, place)
                 asp_info.update_masks(param.name, weight_sparse_mask)
+
         return asp_info.masks.copy()
 
     @staticmethod
@@ -384,7 +379,7 @@ def _get_mask_name(param_name):
         Returns:
             string: The mask name of :attr:`param_name`.
         """
-        return param_name + ASPHelper.MASK_APPENDDED_NAME
+        return param_name + "." + ASPHelper.MASK_APPENDDED_NAME
 
     @staticmethod
     def _get_not_ASP_relevant_vars(main_program):
@@ -434,19 +429,46 @@ def _is_supported_layer(cls, main_program, param_name):
               # fc_0.w_0 -> True
               # fc_0.b_0 -> False
         """
-        if ASPHelper.MASK_APPENDDED_NAME in param_name:
+        param_name_list = param_name.split('.')
+
+        if ASPHelper.MASK_APPENDDED_NAME in param_name_list:
             return False
 
         for layer in cls._get_program_asp_info(main_program).excluded_layers:
             if layer in param_name:
                 return False
 
-        for name in ASPHelper.SUPPORTED_LAYERS:
-            if name in param_name and \
-               ASPHelper.SUPPORTED_LAYERS[name] in param_name:
-                return True
+        if param_name in supported_layers_and_prune_func_map:
+            return True
+
+        param_name_no_weight_suffix = param_name_list[0]
+        param_type_suffix = param_name_list[1]
+        layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix.
+                                                 rfind('_')]
+        if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix:
+            return False
+
+        if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \
+            layer_name in supported_layers_and_prune_func_map:
+            return True
+
         return False
 
+    @classmethod
+    def _get_prune_func_by_name(cls, param_name):
+        func = supported_layers_and_prune_func_map.get(param_name, None)
+        param_name_no_weight_suffix = param_name.split('.')[0]
+        if func is None:
+            func = supported_layers_and_prune_func_map.get(
+                param_name_no_weight_suffix, None)
+        if func is None:
+            layer_name = param_name_no_weight_suffix[:
+                                                     param_name_no_weight_suffix.
+                                                     rfind('_')]
+            func = supported_layers_and_prune_func_map.get(layer_name,
+                                                           _default_pruning)
+        return func
+
     @classmethod
     def _minimize(cls,
                   optimizer,
@@ -509,8 +531,7 @@ def _create_mask_variables(cls, main_program, startup_program,
                 if ASPHelper._is_supported_layer(main_program,
                                                  param_and_grad[0].name):
                     mask_param = layers.create_parameter(
-                        name=param_and_grad[0].name +
-                        ASPHelper.MASK_APPENDDED_NAME,
+                        name=ASPHelper._get_mask_name(param_and_grad[0].name),
                         shape=param_and_grad[0].shape,
                         dtype=param_and_grad[0].dtype,
                         default_initializer=ConstantInitializer(value=1.0))
diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
new file mode 100644
index 0000000000000..105c2ded9eee7
--- /dev/null
+++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+from paddle.fluid.contrib import sparsity
+import threading
+
+__all__ = ['add_supported_layer']
+
+
+def _default_pruning(weight_nparray, m, n, func_name, param_name):
+
+    checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
+
+    # The double transpose ops here make sure pruning direction consistent with cuSparseLt.
+    # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix.
+    # cuSparseLt would prune matrix A along k dimension.
+    # In sparse training, layer weight matrices is viewed sparse matrix A, so
+    # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle
+    #  is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed 
+    # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension 
+    # of W^T, which is m dimension of W. Moreove, all mask generating functions in 
+    # sparsity/utils is row-major pruning. That is the reason we have to transpose weight 
+    # matrices beforce invoking create_mask. Then we transpose the result mask to make 
+    # sure its shape to be the same as the input weight.
+    weight_sparse_mask = sparsity.create_mask(
+        weight_nparray.T, func_name=func_name, n=n, m=m).T
+    weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask)
+    assert sparsity.check_sparsity(weight_pruned_nparray.T,  n=n, m=m, func_name=checked_func_name), \
+                    'Pruning {} weight matrix failure!!!'.format(param_name)
+    return weight_pruned_nparray, weight_sparse_mask
+
+
+# When value of given key in this DICT is None, 
+# ASP will call default pruning function in pruning stage.
+_supported_layers_and_prune_func_map_lock = threading.Lock()
+supported_layers_and_prune_func_map = {}
+
+
+def add_supported_layer(layer, pruning_func=None):
+    r"""
+    Add supported layers and its corresponding pruning function.
+
+    Args:
+        name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then 
+        it would be turn to string internally. ASP would use this name to match parameter's name and call 
+        its the corresponding pruning function.
+        pruning_func (function, optional): a function type which receives five argument (weight_nparray,
+        m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight,
+        m, n, and func_name, please see `prune_model` for details.
+    """
+    name = None
+    if isinstance(layer, str):
+        name = layer
+    elif isinstance(layer, paddle.fluid.dygraph.layers.Layer):
+        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
+            type(layer).__name__)
+    elif issubclass(layer, paddle.fluid.dygraph.layers.Layer):
+        name = paddle.fluid.dygraph.layers._convert_camel_to_snake(
+            layer.__name__)
+    else:
+        assert "The type of layer should be string of Layer, but got {}!".format(
+            type(layer))
+    if pruning_func is None:
+        pruning_func = _default_pruning
+    _supported_layers_and_prune_func_map_lock.acquire()
+    supported_layers_and_prune_func_map.update({name: pruning_func})
+    _supported_layers_and_prune_func_map_lock.release()
+
+
+add_supported_layer('fc')
+add_supported_layer('linear')
+add_supported_layer('conv2d')
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index 706ec0d523b93..da66530f81b0a 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -30,6 +30,7 @@
 import queue
 
 import paddle
+import paddle.profiler as profiler
 from .. import core, layers
 from ..framework import in_dygraph_mode, _in_eager_mode
 from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar
@@ -250,6 +251,10 @@ def _thread_loop(self, legacy_expected_place):
         self._exit_thread_expectedly()
 
     def __next__(self):
+        trace_event = profiler.RecordEvent(
+            name="_DataLoaderIterSingleProcess",
+            event_type=profiler.TracerEventType.Dataloader)
+        trace_event.begin()
         try:
             if in_dygraph_mode():
                 if _in_eager_mode():
@@ -283,6 +288,8 @@ def __next__(self):
             self._reader.shutdown()
             self._try_shutdown_all()
             six.reraise(*sys.exc_info())
+        finally:
+            trace_event.end()
 
     def _shutdown_thread(self):
         if self._thread:
@@ -564,6 +571,14 @@ def _get_data(self):
                     self._rcvd_idx += 1
                     self._batches_outstanding -= 1
                 else:
+                    # NOTE: when _rcvd_idx catch up _send_idx, which means
+                    #       one of following:
+                    #       1. all 2 * num_workers batches have been loaded
+                    #          and stored in _blocking_queue
+                    #       2. all data drained
+                    #       we need to let _thread blocking at _data_queue
+                    #       get_data to inoccupy CPU, otherwise may occupy
+                    #       CPU time for model running
                     # NOTE: in persistent workers mode, do not check data
                     #       drained here, simply let it go to _data_queue
                     #       reading to get _ResumeIteration
@@ -573,7 +588,6 @@ def _get_data(self):
                         #       may also be data in blocking queue
                         if self._batches_outstanding < len(self._places):
                             return None
-                        continue
 
             if self._rcvd_idx in self._task_infos and \
                     len(self._task_infos[self._rcvd_idx]) == 3:
@@ -688,6 +702,10 @@ def _shutdown_on_exit(self):
         self._try_shutdown_all(1)
 
     def __next__(self):
+        trace_event = profiler.RecordEvent(
+            name="_DataLoaderIterMultiProcess",
+            event_type=profiler.TracerEventType.Dataloader)
+        trace_event.begin()
         try:
             # _batches_outstanding here record the total batch data number
             # in 'from after _try_put_indices to beforeoutput data', this
@@ -736,6 +754,8 @@ def __next__(self):
                 self._reader.shutdown()
                 self._try_shutdown_all()
             six.reraise(*sys.exc_info())
+        finally:
+            trace_event.end()
 
     # python2 compatibility
     def next(self):
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index 191661b7bf9d5..4127f1e4449bf 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -271,18 +271,28 @@ def amp_guard(enable=True,
             "current_tracer is None, maybe it is not in imperative mode.")
 
     # check device_type:
-    # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16.
+    # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16.
     # Maybe we will support cpu for bfloat16.
     if enable and not (tracer._expected_place.is_gpu_place() or
-                       tracer._expected_place.is_xpu_place()):
+                       tracer._expected_place.is_xpu_place() or
+                       tracer._expected_place.is_mlu_place() or
+                       tracer._expected_place.is_npu_place()):
         warnings.warn(
-            'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
+            'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.'
             % tracer._expected_place)
         enable = False
+    # For npu:
+    if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
+        warnings.warn('NPUPlace only support float16 amp.')
+        enable = False
     # For xpu:
     if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
         warnings.warn('XPUPlace only support float16 amp.')
         enable = False
+    # For mlu:
+    if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'):
+        warnings.warn('MLUPlace only support float16 amp.')
+        enable = False
     # For gpu float16: Compute Capability should >= 7.
     # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11.
     if tracer._expected_place.is_gpu_place():
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index f7c2d6be574c4..c57290861942b 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -105,9 +105,11 @@ def __init__(self,
                 "current_tracer is None, maybe it is not in imperative mode.")
 
         if enable and not (tracer._expected_place.is_gpu_place() or
-                           tracer._expected_place.is_xpu_place()):
+                           tracer._expected_place.is_xpu_place() or
+                           tracer._expected_place.is_mlu_place() or
+                           tracer._expected_place.is_npu_place()):
             warnings.warn(
-                'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
+                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place)
             enable = False
 
@@ -286,14 +288,28 @@ def _unscale(self, optimizer):
                     ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
                            )
             ]
-        if len(param_grads_fp16):
-            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
-                                            param_grads_fp16,
-                                            self._temp_found_inf_fp16)
-        if len(param_grads_fp32):
-            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
-                                            param_grads_fp32,
-                                            self._temp_found_inf_fp32)
+        if core.is_compiled_with_npu():
+            float_status = _C_ops.alloc_float_status()
+            _C_ops.clear_float_status(float_status, float_status)
+
+            if len(param_grads_fp16):
+                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
+                                                float_status, param_grads_fp16,
+                                                self._temp_found_inf_fp16)
+            if len(param_grads_fp32):
+                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
+                                                float_status, param_grads_fp32,
+                                                self._temp_found_inf_fp32)
+        else:
+            if len(param_grads_fp16):
+                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
+                                                param_grads_fp16,
+                                                self._temp_found_inf_fp16)
+            if len(param_grads_fp32):
+                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
+                                                param_grads_fp32,
+                                                self._temp_found_inf_fp32)
+
         if len(param_grads_fp16) and len(param_grads_fp32):
             self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32
         elif len(param_grads_fp16):
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 8149d69d36a27..b4c5a36d288b7 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -565,16 +565,25 @@ def check_in_out(in_out_list, name):
         if isinstance(in_out_list, (list, tuple)):
             assert len(in_out_list) > 0, "{} cannot be empty".format(name)
             for each_var in in_out_list:
-                assert isinstance(
-                    each_var,
-                    core.VarBase), "Elements of {} must be Variable".format(
-                        name)
+                if core._in_eager_mode():
+                    assert isinstance(
+                        each_var, core.eager.
+                        Tensor), "Elements of {} must be Tensor".format(name)
+                else:
+                    assert isinstance(
+                        each_var,
+                        core.VarBase), "Elements of {} must be Variable".format(
+                            name)
             return in_out_list
         else:
-            assert isinstance(
-                in_out_list,
-                core.VarBase), "{} must be Variable or list of Variable".format(
-                    name)
+            if core._in_eager_mode():
+                assert isinstance(
+                    in_out_list, core.eager.
+                    Tensor), "{} must be Tensor or list of Tensor".format(name)
+            else:
+                assert isinstance(
+                    in_out_list, core.VarBase
+                ), "{} must be Variable or list of Variable".format(name)
             return [in_out_list]
 
     outputs = check_in_out(outputs, 'outputs')
@@ -586,9 +595,14 @@ def check_in_out(in_out_list, name):
 
         for each_var in grad_outputs:
             if each_var is not None:
-                assert isinstance(
-                    each_var, core.VarBase
-                ), "grad_outputs must be None, a Variable or a list containing None or Variables"
+                if core._in_eager_mode():
+                    assert isinstance(
+                        each_var, core.eager.Tensor
+                    ), "grad_outputs must be None, a Variable or a list containing None or Variables"
+                else:
+                    assert isinstance(
+                        each_var, core.VarBase
+                    ), "grad_outputs must be None, a Variable or a list containing None or Variables"
     else:
         grad_outputs = []
 
@@ -598,16 +612,29 @@ def check_in_out(in_out_list, name):
 
     if no_grad_vars is None:
         no_grad_vars = []
-    elif isinstance(no_grad_vars, core.VarBase):
+    elif isinstance(no_grad_vars, (core.VarBase, core.eager.Tensor)):
+        no_grad_vars = [no_grad_vars]
+    elif isinstance(no_grad_vars, core.eager.Tensor):
         no_grad_vars = [no_grad_vars]
     elif isinstance(no_grad_vars, (list, tuple, set)):
         no_grad_vars = list(no_grad_vars)
         for var in no_grad_vars:
-            assert isinstance(
-                var, core.VarBase), "no_grad_vars can only contains Variable"
+            if core._in_eager_mode():
+                assert isinstance(
+                    var,
+                    core.eager.Tensor), "no_grad_vars can only contains Tensor"
+            else:
+                assert isinstance(
+                    var,
+                    core.VarBase), "no_grad_vars can only contains Variable"
     else:
-        raise AssertionError(
-            "no_grad_vars must be None, Variable or list/tuple/set of Variables")
+        if core._in_eager_mode():
+            raise AssertionError(
+                "no_grad_vars must be None, Tensor or list/tuple/set of Tensors")
+        else:
+            raise AssertionError(
+                "no_grad_vars must be None, Variable or list/tuple/set of Variables"
+            )
 
     assert isinstance(create_graph, bool), "create_graph must be True or False"
 
@@ -622,6 +649,11 @@ def check_in_out(in_out_list, name):
     assert isinstance(only_inputs, bool), "only_inputs must be True or False"
     assert only_inputs, "only_inputs=False is not supported yet"
 
+    if core._in_eager_mode():
+        return core.eager.run_partial_grad(
+            outputs, inputs, grad_outputs, retain_graph, create_graph,
+            only_inputs, allow_unused, no_grad_vars)
+
     place = core.Place()
     place.set_place(framework._current_expected_place())
     return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
@@ -686,13 +718,13 @@ def to_variable(value, name=None, zero_copy=None, dtype=None):
             y.shape     # [3L, 2L]
 
     """
-    support_type = (list, tuple, np.ndarray, core.VarBase, framework.Variable,
-                    core.Tensor, core.LoDTensor)
+    support_type = (list, tuple, np.ndarray, core.eager.Tensor, core.VarBase,
+                    framework.Variable, core.Tensor, core.LoDTensor)
     if not isinstance(value, support_type):
         raise TypeError(
             "The type of 'value' in fluid.dygraph.to_variable must be %s, but received %s."
             % (support_type, type(value)))
-    if isinstance(value, (core.VarBase, framework.Variable)):
+    if isinstance(value, (core.eager.Tensor, core.VarBase, framework.Variable)):
         return value
     elif isinstance(value, (core.Tensor, core.LoDTensor)):
         return core.VarBase(value)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
index e1df2324889b4..7733226cc09f2 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py
@@ -297,10 +297,6 @@ def _is_var_shape(self, node):
         return False
 
     def _update_name_to_var_shape(self, node):
-        def replace_dot(name):
-            # replace all '.' into '_'
-            return name.replace('.', '_')
-
         assert isinstance(node, gast.Assign)
         target_node = node.targets[0]
         value_node = node.value
@@ -315,7 +311,6 @@ def replace_dot(name):
                     if value_node.id in self.name_to_var_shape:
                         # TODO(zhhsplendid): is context a problem for the result node of gast.parse?
                         static_shape_var_name = unique_name.generate(
-                            replace_dot(target_id) +
                             STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                         static_shape_var_node = gast.parse(
                             static_shape_var_name).body[0].value
@@ -337,7 +332,6 @@ def replace_dot(name):
                 if isinstance(value_node, gast.Attribute):
                     if self._is_var_shape(value_node):  # eg: x.shape
                         static_shape_var_name = unique_name.generate(
-                            replace_dot(target_id) +
                             STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                         static_shape_var_node = gast.parse(
                             static_shape_var_name).body[0].value
@@ -370,7 +364,6 @@ def replace_dot(name):
             if isinstance(value_node, gast.Name):
                 if value_node.id in self.name_to_var_shape:
                     static_shape_var_name = unique_name.generate(
-                        replace_dot(target_id) +
                         STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                     static_shape_var_node = gast.parse(
                         static_shape_var_name).body[0].value
@@ -387,7 +380,7 @@ def replace_dot(name):
                     self.name_to_var_shape[target_id] = static_shape_var_name
             elif self._is_var_shape(value_node):  # eg: x.shape or x.shape[0]
                 static_shape_var_name = unique_name.generate(
-                    replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX)
+                    STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                 static_shape_var_node = gast.parse(static_shape_var_name).body[
                     0].value
                 static_shape_value_node = copy.deepcopy(value_node)
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index f58952d3036c5..a36164a277dec 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -30,6 +30,7 @@
 from paddle.fluid.layers.utils import _hash_with_id
 from paddle.fluid.dygraph.base import switch_to_static_graph
 from paddle.fluid.framework import in_dygraph_mode
+from paddle import _C_ops
 
 __all__ = ['TranslatedLayer']
 
@@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path,
     return var_dict
 
 
+def _valid_vars(vars):
+    if vars:
+        return vars
+    if framework._in_eager_mode():
+        return [
+            core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var",
+                              core.VarDesc.VarType.RAW, False)
+        ]
+    else:
+        return [
+            core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var",
+                         core.VarDesc.VarType.RAW, False)
+        ]
+
+
 def _run_dygraph(instance, input, program_holder):
 
     # 1. prepare inputs, outputs, attrs
@@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder):
 
     # hold forward variables
     if framework._in_eager_mode():
-        tmp_scope_vec = core.eager.Tensor(
-            dtype=core.VarDesc.VarType.FP32,
-            dims=[],
-            name="program_out_scope",
-            type=core.VarDesc.VarType.STEP_SCOPES,
-            persistable=True)
+        tmp_scope_vec = [program_holder.scope]
     else:
         tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                      "program_out_scope",
                                      core.VarDesc.VarType.STEP_SCOPES, True)
-    tmp_scope_vec.value().set_scope(program_holder.scope)
+        tmp_scope_vec.value().set_scope(program_holder.scope)
 
     double_grad_vars = []
     for var_desc in program_holder.double_grad_descs:
@@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder):
                                var_desc.shape(),
                                var_desc.name(), var_desc.type(), False)
         double_grad_vars.append(var)
-    if len(double_grad_vars) == 0:
-        if framework._in_eager_mode():
-            double_grad_vars = [
-                core.eager.Tensor(
-                    value=[1],
-                    name='Fake_var',
-                    place=framework._current_expected_place())
-            ]
-        else:
-            double_grad_vars = [
-                core.VarBase(
-                    value=[1],
-                    name='Fake_var',
-                    place=framework._current_expected_place())
-            ]
 
     # 2. run program by op
     trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program
     end_op_index = program_holder.infer_program.block(0).op_size()
-    framework._dygraph_tracer().trace_op(
-        type='run_program',
-        inputs={'X': input_vars,
-                'Params': persistable_vars},
-        outputs={
-            'Out': output_vars,
-            'OutScope': tmp_scope_vec,
-            'DOut': double_grad_vars
-        },
-        attrs={
-            'global_block': trace_program.block(0),
-            'start_op_index': 0,
-            'end_op_index': end_op_index,
-            'is_test': instance._is_test,
-            'program_id': _hash_with_id(trace_program, instance)
-        })
+    attrs = ('global_block', trace_program.block(0), 'start_op_index', 0,
+             'end_op_index', end_op_index, 'is_test', instance._is_test,
+             'program_id', _hash_with_id(trace_program, instance))
+    _C_ops.run_program(
+        _valid_vars(input_vars),
+        _valid_vars(persistable_vars),
+        _valid_vars(output_vars), tmp_scope_vec,
+        _valid_vars(double_grad_vars), *attrs)
     # NOTE: [ why need set param's gradient type here ]
     # if user set sparse gradient mode, the param's gradient
     # will be SelectedRows, not LoDTensor. But tracer will just
@@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder):
 
 def drop_scope_if_no_grad(instance, scope_vec):
     tracer = framework._dygraph_tracer()
+    scope = scope_vec.value().get_scope() if isinstance(scope_vec, (
+        core.VarBase)) else scope_vec[0]
     if (not instance._is_test) and (not tracer._has_grad):
-        scope_vec.value().get_scope().drop_kids()
+        scope.drop_kids()
 
 
 def _run_static_graph(input, program_holder, trace_program):
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index b1865691b2475..1e1ce3ba7e491 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -821,7 +821,7 @@ def fun(inputs):
         for var in flatten(input_spec):
             if isinstance(var, paddle.static.InputSpec):
                 inner_input_spec.append(var)
-            elif isinstance(var, (core.VarBase, Variable)):
+            elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)):
                 inner_input_spec.append(
                     paddle.static.InputSpec.from_tensor(var))
             else:
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 53dbf1a66b27f..37db9f8fce77a 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -25,6 +25,7 @@
 import inspect
 
 import paddle
+import paddle.profiler as profiler
 
 from . import parallel_helper
 from .. import unique_name
@@ -760,7 +761,8 @@ def register_buffer(self, name, tensor, persistable=True):
             raise KeyError("The name of buffer can not be empty.")
         elif hasattr(self, name) and name not in self._buffers:
             raise KeyError("attribute '{}' already exists.".format(name))
-        elif tensor is not None and not type(tensor) == core.VarBase:
+        elif tensor is not None and not (type(tensor) == core.VarBase or
+                                         type(tensor) == core.eager.Tensor):
             raise TypeError(
                 "The registered buffer should be a core.VarBase, but received {}.".
                 format(type(tensor).__name__))
@@ -904,7 +906,9 @@ def _dygraph_call_func(self, *inputs, **kwargs):
 
             self._built = True
 
-        outputs = self.forward(*inputs, **kwargs)
+        with profiler.RecordEvent(self.full_name(),
+                                  profiler.TracerEventType.Forward):
+            outputs = self.forward(*inputs, **kwargs)
 
         for forward_post_hook in self._forward_post_hooks.values():
             hook_result = forward_post_hook(self, inputs, outputs)
@@ -1154,7 +1158,8 @@ def _remove_if_exist(*dicts):
                 layers[name] = None
             else:
                 _buffers = self.__dict__.get('_buffers', None)
-                if type(value) == core.VarBase:
+                if type(value) == core.VarBase or \
+                    type(value) == core.eager.Tensor:
                     if _buffers is None:
                         raise ValueError(
                             "super(YourLayer, self).__init__() should be called first"
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 5bb1aef6d6e9b..b41e3e0b502b5 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -2986,6 +2986,12 @@ def __init__(self,
             is_bias=True)
 
     def forward(self, input):
+        if in_dygraph_mode():
+            attrs = ('epsilon', self._epsilon, 'groups', self._groups)
+            out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs)
+
+            return dygraph_utils._append_activation_in_dygraph(out, self._act)
+
         inputs = {'X': input}
         if self.bias is not None:
             inputs['Bias'] = self.bias
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
index a7dd938a1cfe7..d1efe0afeaad0 100644
--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -30,11 +30,22 @@
         "y": "Y",
         "out": "Out",
     },
+    # "elementwise_add": {
+    #     "final_op_name": "final_state_add",
+    #     "x": "X",
+    #     "y": "Y",
+    # },
     "trunc": {
         "final_op_name": "final_state_trunc",
         "x": "X",
         "out": "Out",
     },
+    "pool2d": {
+        "final_op_name": "final_state_pool2d",
+        "x": "X",
+        "kernel_size": "ksize",
+        "out": "Out",
+    },
     "abs": {
         "final_op_name": "final_state_abs",
         "x": "X",
@@ -52,6 +63,12 @@
         "axis1": "axis1",
         "axis2": "axis2",
         "out": "Out",
+    },
+    "one_hot": {
+        "final_op_name": "final_state_one_hot",
+        "x": "X",
+        "num_class": "depth",
+        "out": "Out",
     }
 }
 
@@ -252,7 +269,6 @@ def trace_op(self,
         if framework._in_eager_mode():
             # inputs : {"sum": [tensor], ...}
             # outputs : {"sum": [tensor], ...}
-
             if type in final_state_name_mapping.keys():
                 final_state_type = final_state_name_mapping[type][
                     "final_op_name"]
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 2b67a2029727f..2ca923f863487 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -28,6 +28,8 @@
 from .parallel import scale_loss
 from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE
 import paddle.utils.deprecated as deprecated
+import paddle.profiler as profiler
+from paddle import _C_ops
 
 
 class TensorHookRemoveHelper(object):
@@ -198,8 +200,8 @@ def backward(self, grad_tensor=None, retain_graph=False):
         You can clear gradient by ``Tensor.clear_grad()`` .
 
         Args:
-            grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, 
-            the initial gradient values of the current Tensor would be Tensor filled with 1.0; 
+            grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
+            the initial gradient values of the current Tensor would be Tensor filled with 1.0;
             if `grad_tensor` is not None, it must have the same length as the current Tensor.
             Teh default value is None.
 
@@ -242,6 +244,9 @@ def backward(self, grad_tensor=None, retain_graph=False):
 
         """
         if framework.in_dygraph_mode():
+            record_event = profiler.RecordEvent(
+                "Gradient Backward", profiler.TracerEventType.Backward)
+            record_event.begin()
             if grad_tensor is not None:
                 if core._in_eager_mode():
                     assert isinstance(
@@ -277,6 +282,7 @@ def backward(self, grad_tensor=None, retain_graph=False):
                     core.dygraph_run_backward([self], [grad_tensor],
                                               retain_graph,
                                               framework._dygraph_tracer())
+            record_event.end()
         else:
             raise ValueError(
                 "Variable.backward() is only available in DyGraph mode")
@@ -475,7 +481,7 @@ def transform(t, device, dtype, blocking):
     def grad(self):
         """
         .. warning::
-          This API will return the tensor value of the gradient. If you want 
+          This API will return the tensor value of the gradient. If you want
           to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`.
 
         Get the Gradient of Current Tensor.
@@ -514,7 +520,7 @@ def clear_grad(self):
 
     def item(self, *args):
         """
-        Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a 
+        Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a
         single-element Tensor.
 
         Args:
@@ -525,7 +531,7 @@ def item(self, *args):
 
         Raises:
             ValueError: If the Tensor has more than one element, there must be coordinates.
-        
+
         Examples:
             .. code-block:: python
 
@@ -587,7 +593,7 @@ def __str__(self):
                 import paddle
                 x = paddle.rand([2, 5])
                 print(x)
-                
+
                 # Tensor(shape=[2, 5], dtype=float32, place=CPUPlace,
                 #        [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436],
                 #         [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]])
@@ -610,7 +616,7 @@ def __deepcopy__(self, memo):
                 import copy
                 x = paddle.to_tensor(2.)
                 y = copy.deepcopy(x)
-                
+
                 print(x)
                 # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True,
                 #        [2.])
@@ -654,7 +660,7 @@ def __bool__(self):
     def __array__(self, dtype=None):
         """
         Returns a numpy array shows the value of current Tensor.
-        
+
         Returns:
             ndarray: The numpy value of current Tensor.
 
@@ -762,8 +768,11 @@ def is_combine_index(item):
             return _setitem_impl_(self, item, value)
 
         else:
-            # Call c++ func __setitem_varbase__ to speedup.
-            return self.__setitem_varbase__(item, value)
+            if core._in_eager_mode():
+                return self.__setitem_eager_tensor__(item, value)
+            else:
+                # Call c++ func __setitem_varbase__ to speedup.
+                return self.__setitem_varbase__(item, value)
 
     @framework.dygraph_only
     def _grad_ivar(self):
@@ -782,7 +791,7 @@ def _set_grad_ivar(self, value):
 
     @framework.dygraph_only
     def clone(self):
-        return _C_ops_.assign(self)
+        return _C_ops.assign(self)
 
     @framework.dygraph_only
     def value(self):
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d0a94238a7aeb..fb787215d910e 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -316,7 +316,8 @@ def __impl__(*args, **kwargs):
 
 def _dygraph_only_(func):
     def __impl__(*args, **kwargs):
-        assert in_dygraph_mode(
+        assert (
+            in_dygraph_mode() or _in_eager_mode()
         ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__
         return func(*args, **kwargs)
 
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 676ee3e3c774e..148f4d95c64fd 100755
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -266,9 +266,10 @@ def func(x, name=None):
                                      op_type)
         else:
             # abs exp square ops support dtype(int32, int64, float16, float32, float64)
-            check_variable_and_dtype(
-                x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
-                op_type)
+            check_variable_and_dtype(x, 'x', [
+                'int32', 'int64', 'float16', 'float32', 'float64', 'complex64',
+                'complex128'
+            ], op_type)
 
         helper = LayerHelper(op_type, **locals())
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fd7226c48661f..63a2aeabc2384 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5611,12 +5611,15 @@ def transpose(x, perm, name=None):
 
     """
     if in_dygraph_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_transpose(x, perm)
         out, _ = _C_ops.transpose2(x, 'axis', perm)
         return out
 
-    check_variable_and_dtype(
-        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-        'transpose')
+    check_variable_and_dtype(x, 'x', [
+        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
+        'complex128'
+    ], 'transpose')
     check_type(perm, 'perm', (list, tuple), 'transpose')
     if isinstance(perm, tuple):
         perm = list(perm)
@@ -6299,7 +6302,14 @@ def get_attr_shape(list_shape):
                 if dim_size == -1:
                     assert unk_dim_idx == -1, (
                         "Only one dimension value of 'shape' in reshape can "
-                        "be -1. But received shape[%d] is also -1." % dim_idx)
+                        "be -1. But received shape[%d] is also -1.\n"
+                        "\n\t# N = x.shape()[2]\t\t# N is an int. "
+                        "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t"
+                        "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])"
+                        "\t# z.shape is [-1, -1, 4]\n\n"
+                        "    If your target shape in Reshape represents dynamic shape, "
+                        "please turn it into a Tensor under @to_static. See above example for details."
+                        % dim_idx)
                     unk_dim_idx = dim_idx
                 elif dim_size == 0:
                     assert dim_idx < len(x.shape), (
@@ -6401,10 +6411,10 @@ def squeeze(input, axes, name=None):
         return out
 
     helper = LayerHelper("squeeze", **locals())
-    check_variable_and_dtype(
-        input, 'input',
-        ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
-        'squeeze')
+    check_variable_and_dtype(input, 'input', [
+        'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64',
+        'complex64', 'complex128'
+    ], 'squeeze')
     check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -6462,8 +6472,16 @@ def unsqueeze(input, axes, name=None):
 
     check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
     check_variable_and_dtype(input, 'input', [
-        'float16', 'float32', 'float64', 'bool', 'int8', 'int16', 'int32',
-        'int64'
+        'float16',
+        'float32',
+        'float64',
+        'bool',
+        'int8',
+        'int16',
+        'int32',
+        'int64',
+        'complex64',
+        'complex128',
     ], 'unsqueeze')
     helper = LayerHelper("unsqueeze2", **locals())
     inputs = {"X": input}
@@ -8543,6 +8561,8 @@ def gather_nd(input, index, name=None):
 
     """
     if in_dygraph_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_gather_nd(input, index)
         return _C_ops.gather_nd(input, index)
     check_variable_and_dtype(
         input, 'input',
@@ -8719,6 +8739,8 @@ def scatter_nd_add(ref, index, updates, name=None):
     """
 
     if in_dygraph_mode():
+        #if _in_eager_mode():
+        #return _C_ops.final_state_scatter_nd_add(ref, index, updates)
         op = getattr(_C_ops, 'scatter_nd_add')
         return op(ref, index, updates)
 
@@ -11167,8 +11189,8 @@ def slice(input, axes, starts, ends):
             ends_tensor.stop_gradient = True
             infer_flags = list(-1 for i in range(len(axes)))
 
-        return _C_ops.slice(input, starts_tensor, ends_tensor, 'axes', axes,
-                            'infer_flags', infer_flags, *attrs)
+        return _C_ops.slice(input, starts_tensor, ends_tensor, None, None,
+                            'axes', axes, 'infer_flags', infer_flags, *attrs)
 
     if not isinstance(starts, (list, tuple, Variable)):
         raise ValueError(
@@ -15285,6 +15307,8 @@ def gather_tree(ids, parents):
 
     """
     if in_dygraph_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_gather_tree(ids, parents)
         return _C_ops.gather_tree(ids, parents)
     else:
         helper = LayerHelper('gather_tree', **locals())
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index c63ad42288fd0..c5accd9ada8f7 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -632,7 +632,7 @@ def assign(input, output=None):
             dtype = VarDesc.VarType.FP32
         if dtype == VarDesc.VarType.BOOL:
             value_name = "bool_values"
-            values = [bool(v) for v in input.flat]
+            values = [int(v) for v in input.flat]
         elif dtype == VarDesc.VarType.FP32:
             value_name = "fp32_values"
             values = [float(v) for v in input.flat]
@@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     check_shape(shape)
     check_dtype(dtype, 'dtype', [
         'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32',
-        'int64'
+        'int64', 'complex64', 'complex128'
     ], 'fill_constant')
     check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant')
 
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 183a00bd70bdf..4d39d38853063 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -20,6 +20,8 @@
 import six
 import sys
 
+from paddle.utils.deprecated import deprecated
+
 __all__ = [
     'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
     'stop_profiler'
@@ -36,10 +38,16 @@
 ]
 
 
+@deprecated(
+    since="2.3.0",
+    update_to="paddle.profiler.Profiler",
+    level=1,
+    reason="Please use new profiler tool, this profiler tool is no longer maintained."
+)
 @signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
     """
-    API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. 
+    API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`.
     The relevant reference documents are as follows:
     <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/start_profiler_en.html#start-profiler>
     <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/stop_profiler_en.html#stop-profiler>
@@ -54,18 +62,18 @@ def cuda_profiler(output_file, output_mode=None, config=None):
 def npu_profiler(output_file, config=None):
     """
     The NPU profiler.
-    
+
     This fuctions is used to profile NPU program by NPU runtime application
     programming interface. The profiling result will be written into
-    `output_file`. The users can set set the NPU profiling config by `config` argument. 
-    
-    After getting the profiling result file, users can use 
-    `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ 
+    `output_file`. The users can set set the NPU profiling config by `config` argument.
+
+    After getting the profiling result file, users can use
+    `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
     to load this output file to visualize results.
 
     Args:
         output_file (str) : The output file name, the result will be
-            written into this file. It should be absolute path. 
+            written into this file. It should be absolute path.
         config (list<str>, optional) : NPU profile config. For more details, please
             refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .
 
@@ -109,6 +117,12 @@ def npu_profiler(output_file, config=None):
         core.npu_prof_finalize()
 
 
+@deprecated(
+    since="2.3.0",
+    update_to="paddle.profiler.Profiler",
+    level=1,
+    reason="Please use new profiler tool, this profiler tool is no longer maintained."
+)
 def reset_profiler():
     """
     Clear the previous time record. It works for
@@ -131,31 +145,38 @@ def reset_profiler():
     core.reset_profiler()
 
 
+@deprecated(
+    since="2.3.0",
+    update_to="paddle.profiler.Profiler",
+    level=1,
+    reason="Please use new profiler tool, this profiler tool is no longer maintained."
+)
 def start_profiler(state, tracer_option='Default'):
     """
     Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to profile, which is equal to the usage 
+    `fluid.profiler.stop_profiler` to profile, which is equal to the usage
     of `fluid.profiler.profiler` interface.
 
     Args:
         state (str) : The profiling state, which should be one of 'CPU', 'GPU'
             or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
-            both CPU and GPU; 'All' means profiling both CPU and GPU, and 
+            both CPU and GPU; 'All' means profiling both CPU and GPU, and
             generates timeline as well.
         tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it
-            can control the profile level and print the different level profile result. `Default` option print 
-            the different Op type profiling result and the `OpDetail` option print the detail profiling 
-            result of different op types such as compute and data transform, `AllOpDetail` option 
+            can control the profile level and print the different level profile result. `Default` option print
+            the different Op type profiling result and the `OpDetail` option print the detail profiling
+            result of different op types such as compute and data transform, `AllOpDetail` option
             print the detail profiling result of different op name same as `OpDetail`.
 
     Raises:
-        ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option` 
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option`
             is not in ['Default', 'OpDetail', 'AllOpDetail'].
 
     Examples:
 
         .. code-block:: python
 
+            # required: gpu
             import paddle.fluid as fluid
             import paddle.fluid.profiler as profiler
 
@@ -165,7 +186,7 @@ def start_profiler(state, tracer_option='Default'):
                     profiler.reset_profiler()
                 # except each iteration
             profiler.stop_profiler('total', '/tmp/profile')
-            
+
             profiler.start_profiler('GPU', "OpDetail")
             for iter in range(10):
                 if iter == 2:
@@ -198,14 +219,20 @@ def start_profiler(state, tracer_option='Default'):
     core.enable_profiler(prof_state)
 
 
+@deprecated(
+    since="2.3.0",
+    update_to="paddle.profiler.Profiler",
+    level=1,
+    reason="Please use new profiler tool, this profiler tool is no longer maintained."
+)
 def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     """
     Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
-    `fluid.profiler.stop_profiler` to profile, which is equal to the usage 
+    `fluid.profiler.stop_profiler` to profile, which is equal to the usage
     of `fluid.profiler.profiler` interface.
 
     Args:
-        sorted_key (str, optional) : The order of profiling results, which 
+        sorted_key (str, optional) : The order of profiling results, which
             should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
             Default is None, means the profiling results will be printed
             in the order of first end time of events.
@@ -214,7 +241,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
             The `max` means sorting by the maximum execution time.
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
-            and write it into `profile_path`. The default profile_path is `/tmp/profile`. 
+            and write it into `profile_path`. The default profile_path is `/tmp/profile`.
         profile_path (str, optional) : If state == 'All', it will generate timeline,
 
     Raises:
@@ -225,6 +252,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
 
         .. code-block:: python
 
+            # required: gpu
             import paddle.fluid as fluid
             import paddle.fluid.profiler as profiler
 
@@ -254,6 +282,12 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
     core.disable_profiler(key_map[sorted_key], profile_path)
 
 
+@deprecated(
+    since="2.3.0",
+    update_to="paddle.profiler.Profiler",
+    level=1,
+    reason="Please use new profiler tool, this profiler tool is no longer maintained."
+)
 @signature_safe_contextmanager
 def profiler(state,
              sorted_key=None,
@@ -265,9 +299,9 @@ def profiler(state,
     Args:
         state (str) : The profiling state, which should be one of 'CPU', 'GPU'
             or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
-            both CPU and GPU; 'All' means profiling both CPU and GPU, and 
+            both CPU and GPU; 'All' means profiling both CPU and GPU, and
             generates timeline as well.
-        sorted_key (str, optional) : The order of profiling results, which 
+        sorted_key (str, optional) : The order of profiling results, which
             should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
             Default is None, means the profiling results will be printed
             in the order of first end time of events.
@@ -277,11 +311,11 @@ def profiler(state,
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
         profile_path (str, optional) : If state == 'All', it will generate timeline,
-            and write it into `profile_path`. The default profile_path is `/tmp/profile`. 
+            and write it into `profile_path`. The default profile_path is `/tmp/profile`.
         tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it
-            can control the profile level and print the different level profile result. `Default` option print 
-            the different Op type profiling result and the `OpDetail` option print the detail profiling 
-            result of different op types such as compute and data transform, `AllOpDetail` option 
+            can control the profile level and print the different level profile result. `Default` option print
+            the different Op type profiling result and the `OpDetail` option print the detail profiling
+            result of different op types such as compute and data transform, `AllOpDetail` option
             print the detail profiling result of different op name same as `OpDetail`.
 
     Raises:
@@ -319,7 +353,7 @@ def profiler(state,
 
             #### Examples Results ####
             #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' ####
-            # The only difference in 5 sorted_key results is the following sentence: 
+            # The only difference in 5 sorted_key results is the following sentence:
             # "Sorted by number of xxx in descending order in the same thread."
             # The reason is that in this example, above 5 columns are already sorted.
             ------------------------->     Profiling Report     <-------------------------
@@ -339,7 +373,7 @@ def profiler(state,
 
             #### 2) sorted_key = None  ####
             # Since the profiling results are printed in the order of first end time of Ops,
-            # the printed order is feed->conv2d->elementwise_add 
+            # the printed order is feed->conv2d->elementwise_add
             ------------------------->     Profiling Report     <-------------------------
 
             Place: CPU
@@ -366,7 +400,7 @@ def _nvprof_range(iter_id, start, end, exit_after_prof=True):
     Examples:
 
         .. code-block:: python
-            
+
             model = Model()
             for i in range(max_iter):
                 paddle.fluid.profiler._nvprof_range(i, 10, 20):
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 9348b0b50a1c0..c45045509201d 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -886,6 +886,7 @@ def test_distribute_fpn_proposals(self):
                 refer_level=4,
                 refer_scale=224,
                 rois_num=rois_num_dy)
+            print(type(multi_rois_dy))
             output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy
             output_dy_np = []
             for output in output_dy:
@@ -973,4 +974,5 @@ def generate_input(pb_type, pbv_type, loc_type, score_type, name):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b05f16a060684..2acf530eea3fb 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -949,6 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
 endif()
 
 # setting timeout value as 15S
+set_tests_properties(test_run PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
@@ -959,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_profiler PROPERTIES TIMEOUT 120)
+set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120)
 set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120)
@@ -1117,9 +1119,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
 if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
     set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
-    set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150)
     set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150)
     set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30)
     set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200)
     set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py
new file mode 100644
index 0000000000000..a2b499a9e01c3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 NVIDIA Corporation.  All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.contrib import sparsity
+from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map
+from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake
+
+
+class MyOwnLayer(Layer):
+    def __init__(self):
+        super(MyOwnLayer, self).__init__()
+
+    def forward(self, x):
+        return x
+
+
+static_tensor = None
+static_tensor_mask = None
+
+
+def my_own_pruning(tensor, m, n, mask_algo, param_name):
+    global static_tensor
+    global static_tensor_mask
+    if static_tensor is None:
+        static_tensor = np.random.rand(*tensor.shape).astype(np.float32)
+    if static_tensor_mask is None:
+        static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32)
+    return static_tensor, static_tensor_mask
+
+
+class TestASPAddSupportedLayer(unittest.TestCase):
+    def test_add_supported_layer_via_name(self):
+        sparsity.add_supported_layer("test_supported_1")
+        sparsity.add_supported_layer("test_supported_2", my_own_pruning)
+        sparsity.add_supported_layer(MyOwnLayer)
+        my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__)
+
+        self.assertTrue(
+            "test_supported_1" in supported_layers_and_prune_func_map)
+        self.assertTrue(
+            "test_supported_2" in supported_layers_and_prune_func_map)
+        self.assertTrue(
+            "test_supported_2" in supported_layers_and_prune_func_map)
+        self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"]
+                        == my_own_pruning)
+        self.assertTrue(
+            my_own_layer_name in supported_layers_and_prune_func_map)
+
+
+class TestASPStaticCustomerizedPruneFunc(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+
+        self.main_program = fluid.Program()
+        self.startup_program = fluid.Program()
+
+        self.customer_prefix = "customer_layer"
+
+        def build_model():
+            img = fluid.data(
+                name='img', shape=[None, 3, 32, 32], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            hidden = fluid.layers.conv2d(
+                input=img, num_filters=4, filter_size=3, padding=2, act="relu")
+            hidden = fluid.layers.fc(input=hidden,
+                                     size=32,
+                                     act='relu',
+                                     name=self.customer_prefix)
+            hidden = fluid.layers.fc(input=hidden,
+                                     size=32,
+                                     act='relu',
+                                     name=self.customer_prefix)
+            hidden = fluid.layers.fc(input=hidden, size=32, act='relu')
+            prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+            return img, label, prediction
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.img, self.label, self.predict = build_model()
+            self.supported_layer_count_ref = 5
+
+        self.place = paddle.CPUPlace()
+        if core.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+        self.exe = fluid.Executor(self.place)
+
+        sparsity.add_supported_layer(self.customer_prefix, my_own_pruning)
+
+    def test_inference_pruning(self):
+        self.exe.run(self.startup_program)
+
+        sparsity.prune_model(
+            self.main_program, mask_algo="mask_1d", with_mask=False)
+
+        supported_layer_count = 0
+        for param in self.main_program.global_block().all_parameters():
+            mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
+            ))
+            if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
+                                                          param.name):
+                supported_layer_count += 1
+                if (self.customer_prefix in param.name):
+                    self.assertLessEqual(
+                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
+                else:
+                    self.assertTrue(
+                        sparsity.check_sparsity(
+                            mat.T,
+                            func_name=sparsity.CheckMethod.CHECK_1D,
+                            n=2,
+                            m=4))
+        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
+
+    def test_training_pruning(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            loss = fluid.layers.mean(
+                fluid.layers.cross_entropy(
+                    input=self.predict, label=self.label))
+            optimizer = sparsity.decorate(
+                fluid.optimizer.SGD(learning_rate=0.01))
+            optimizer.minimize(loss, self.startup_program)
+
+        self.exe.run(self.startup_program)
+
+        sparsity.prune_model(
+            self.main_program, mask_algo="mask_1d", with_mask=True)
+
+        supported_layer_count = 0
+        for param in self.main_program.global_block().all_parameters():
+            mat = np.array(fluid.global_scope().find_var(param.name).get_tensor(
+            ))
+            if sparsity.asp.ASPHelper._is_supported_layer(self.main_program,
+                                                          param.name):
+                mat_mask = np.array(fluid.global_scope().find_var(
+                    sparsity.asp.ASPHelper._get_mask_name(param.name))
+                                    .get_tensor())
+                supported_layer_count += 1
+                if (self.customer_prefix in param.name):
+                    self.assertLessEqual(
+                        np.sum(mat.flatten() - static_tensor.flatten()), 1e-4)
+                    self.assertLessEqual(
+                        np.sum(mat_mask.flatten() - static_tensor_mask.flatten(
+                        )), 1e-4)
+                else:
+                    self.assertTrue(
+                        sparsity.check_sparsity(
+                            mat.T,
+                            func_name=sparsity.CheckMethod.CHECK_1D,
+                            n=2,
+                            m=4))
+                    self.assertTrue(
+                        sparsity.check_sparsity(
+                            mat_mask.T,
+                            func_name=sparsity.CheckMethod.CHECK_1D,
+                            n=2,
+                            m=4))
+        self.assertEqual(supported_layer_count, self.supported_layer_count_ref)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
index 1f7ae53acdf45..a730d21afa579 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt
@@ -9,6 +9,12 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
     set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240)
     py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS})
     set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80)
+    py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS})
     py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS})
     set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50)
+
+    py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS})
+    py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS})
+    py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS})
+    py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS})
 endif()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py
new file mode 100644
index 0000000000000..ab704a6a25714
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from paddle.distributed.auto_parallel.tuner import recorder as rd
+
+
+class TestRecorder(unittest.TestCase):
+    def test_register(self):
+        recorder = rd.MetricsRecorder()
+        recorder.register("metric")
+        self.assertEqual(set(recorder.records.keys()), {"metric"})
+        self.assertEqual(recorder.records["metric"].direction, "min")
+
+    def test_exists(self):
+        recorder = rd.MetricsRecorder()
+        recorder.register("metric", direction="max")
+        self.assertTrue(recorder.exists("metric"))
+
+    def test_update(self):
+        recorder = rd.MetricsRecorder()
+        recorder.update("metric", 4, 1000)
+        self.assertEqual(recorder.records["metric"].direction, "min")
+        self.assertEqual(
+            recorder.get_records("metric"), [rd.MetricRecord(4, 1000)])
+
+    def test_get_records(self):
+        recorder = rd.MetricsRecorder()
+        recorder.update("metric", 1, step=0)
+        recorder.update("metric", 2, step=1)
+        recorder.update("metric", 3, step=2)
+        recorder.update("metric", 4, step=3)
+        self.assertEqual(
+            recorder.get_records("metric"), [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ])
+
+    def test_set_records(self):
+        recorder = rd.MetricsRecorder()
+        recorder.set_records(
+            "metric",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        self.assertEqual(
+            recorder.get_records("metric"), [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ])
+
+    def test_get_best_value(self):
+        recorder = rd.MetricsRecorder()
+        recorder.register("metric_min", "min")
+        recorder.register("metric_max", "max")
+
+        recorder.set_records(
+            "metric_min",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        self.assertEqual(recorder.get_best_value("metric_min"), 1)
+
+        recorder.set_records(
+            "metric_max",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        self.assertEqual(recorder.get_best_value("metric_max"), 4)
+
+    def test_get_best_step(self):
+        recorder = rd.MetricsRecorder()
+
+        recorder.register("metric_min", "min")
+        recorder.set_records(
+            "metric_min",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        self.assertEqual(recorder.get_best_step("metric_min"), 0)
+
+        recorder.register("metric_max", "max")
+        recorder.set_records(
+            "metric_max",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        self.assertEqual(recorder.get_best_step("metric_max"), 3)
+
+    def test_get_statistics(self):
+        recorder = rd.MetricsRecorder()
+        records = [rd.MetricRecord(np.random.random(), i) for i in range(14)]
+        recorder.set_records("metric", records)
+        stats = recorder.get_statistics("metric")
+        records = [r.value for r in records]
+        self.assertEqual(stats["min"], np.min(records))
+        self.assertEqual(stats["max"], np.max(records))
+        self.assertEqual(stats["mean"], np.mean(records))
+        self.assertEqual(stats["median"], np.median(records))
+        self.assertEqual(stats["var"], np.var(records))
+        self.assertEqual(stats["std"], np.std(records))
+
+    def test_serialization(self):
+        recorder = rd.MetricsRecorder()
+        recorder.register("metric")
+        recorder.set_records(
+            "metric",
+            [
+                rd.MetricRecord(1, 0),
+                rd.MetricRecord(2, 1),
+                rd.MetricRecord(3, 2),
+                rd.MetricRecord(4, 3),
+            ], )
+        print(recorder.get_state())
+        new_recorder = rd.MetricsRecorder.from_state(recorder.get_state())
+        self.assertEqual(new_recorder.records.keys(), recorder.records.keys())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py
new file mode 100644
index 0000000000000..fc52d1c394eff
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.tuner import tunable_space as ts
+from paddle.distributed.auto_parallel.tuner import trial as tr
+
+
+class TestTiral(unittest.TestCase):
+    def test_trial(self):
+        space = ts.TunableSpace()
+        space.choice("choice", [0, 1, 2, 3], default=2)
+        trial = tr.Trial(space, trial_id="trial-1")
+        trial.recorder.register("latency", direction="min")
+        trial.recorder.update("latency", 0.1, step=0)
+        trial.recorder.update("latency", 0.2, step=1)
+        trial.best_step = 0
+
+        self.assertEqual(trial.id, "trial-1")
+        self.assertEqual(trial.space.get_value("choice"), 2)
+        self.assertEqual(trial.best_step, 0)
+        self.assertEqual(trial.status, "RUNNING")
+
+    def test_serialization(self):
+        space = ts.TunableSpace()
+        space.int_range("int_range", start=1, stop=4, default=2)
+        trial = tr.Trial(space, trial_id="trial-2", status="COMPLETED")
+        trial.recorder.register("latency", direction="min")
+        trial.recorder.update("latency", 0.1, step=0)
+        trial.recorder.update("latency", 0.2, step=1)
+        trial.best_step = 0
+
+        new_trial = tr.Trial.from_state(trial.get_state())
+        self.assertEqual(new_trial.id, "trial-2")
+        self.assertEqual(new_trial.space.get_value("int_range"), 2)
+        self.assertEqual(new_trial.best_step, 0)
+        self.assertEqual(new_trial.status, "COMPLETED")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
new file mode 100644
index 0000000000000..cb7104f9ef641
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.tuner import tunable_space as ts
+
+
+class TestTunableSpace(unittest.TestCase):
+    def test_fixed(self):
+        space = ts.TunableSpace()
+        fixed = space.fixed("fixed", default=4)
+        self.assertEqual(space.values["fixed"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["fixed"].name, "fixed")
+
+        space.values["fixed"] = 2
+        self.assertEqual(space.get_value("fixed"), 2)
+        self.assertEqual(space.values, {"fixed": 2})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["fixed"].name, "fixed")
+
+    def test_boolean(self):
+        space = ts.TunableSpace()
+        boolean = space.boolean("boolean")
+        self.assertEqual(space.values["boolean"], False)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["boolean"].name, "boolean")
+
+        space.values["boolean"] = True
+        self.assertEqual(space.get_value("boolean"), True)
+        self.assertEqual(space.values, {"boolean": True})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["boolean"].name, "boolean")
+
+    def test_choice(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(space.values["choice"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+        space.values["choice"] = 2
+        self.assertEqual(space.get_value("choice"), 2)
+        self.assertEqual(space.values, {"choice": 2})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+    def test_int_range(self):
+        space = ts.TunableSpace()
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+        self.assertEqual(space.values["int_range"], 2)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+        space.values["int_range"] = 3
+        self.assertEqual(space.get_value("int_range"), 3)
+        self.assertEqual(space.values, {"int_range": 3})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+    def test_float_range(self):
+        space = ts.TunableSpace()
+        float_range = space.float_range(
+            "float_range", start=0.4, stop=4.4, default=2.0)
+        self.assertEqual(space.values["float_range"], 2.0)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["float_range"].name, "float_range")
+
+        space.values["float_range"] = 3.0
+        self.assertEqual(space.get_value("float_range"), 3.0)
+        self.assertEqual(space.values, {"float_range": 3.0})
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["float_range"].name, "float_range")
+
+    def test_varibles(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(space.values["choice"], 4)
+        self.assertEqual(len(space.variables), 1)
+        self.assertEqual(space.variables["choice"].name, "choice")
+
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+        self.assertEqual(space.values["int_range"], 2)
+        self.assertEqual(len(space.variables), 2)
+        self.assertEqual(space.variables["int_range"].name, "int_range")
+
+    def test_not_populated_variable(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=2)
+        self.assertEqual(choice, 2)
+
+    def test_populated_variable(self):
+        space = ts.TunableSpace()
+        space.values["choice"] = 2
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        self.assertEqual(choice, 2)
+
+        space["choice"] = 3
+        self.assertNotEqual(space.values["choice"], 2)
+        self.assertEqual(space.values["choice"], 3)
+
+    def test_state(self):
+        space = ts.TunableSpace()
+        choice = space.choice("choice", [1, 2, 3, 4], default=4)
+        int_range = space.int_range("int_range", start=1, stop=4, default=2)
+
+        new_space = space.from_state(space.get_state())
+        self.assertEqual(new_space.get_value("choice"), 4)
+        self.assertEqual(new_space.get_value("int_range"), 2)
+        self.assertEqual(len(new_space.variables), 2)
+        self.assertEqual(len(new_space.values), 2)
+
+        self.assertEqual(new_space.variables["choice"].name, "choice")
+        self.assertEqual(new_space.variables["choice"].default, 4)
+        self.assertEqual(new_space.variables["choice"].values, [1, 2, 3, 4])
+
+        self.assertEqual(new_space.variables["int_range"].name, "int_range")
+        self.assertEqual(new_space.variables["int_range"].default, 2)
+        self.assertEqual(new_space.variables["int_range"].start, 1)
+        self.assertEqual(new_space.variables["int_range"].stop, 4)
+        self.assertEqual(new_space.variables["int_range"].step, 1)
+        self.assertEqual(new_space.variables["int_range"].endpoint, False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
new file mode 100644
index 0000000000000..c36fca7a9d09a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.tuner import tunable_variable as tv
+
+
+class TestTunableVariable(unittest.TestCase):
+    def test_fixed(self):
+        fixed = tv.Fixed("fixed", True)
+        fixed = tv.Fixed.from_state(fixed.get_state())
+        self.assertEqual(fixed.default, True)
+        self.assertEqual(fixed.random(), True)
+
+        fixed = tv.Fixed("fixed", 1)
+        fixed = tv.Fixed.from_state(fixed.get_state())
+        self.assertEqual(fixed.default, 1)
+        self.assertEqual(fixed.random(), 1)
+
+    def test_boolean(self):
+        boolean = tv.Boolean("bool")
+        boolean = tv.Boolean.from_state(boolean.get_state())
+        self.assertEqual(boolean.default, False)
+        self.assertIn(boolean.random(), [True, False])
+        self.assertIn(boolean.random(1234), [True, False])
+
+        boolean = tv.Boolean("bool", True)
+        boolean = tv.Boolean.from_state(boolean.get_state())
+        self.assertEqual(boolean.default, True)
+        self.assertIn(boolean.random(), [True, False])
+        self.assertIn(boolean.random(1234), [True, False])
+
+    def test_choice(self):
+        choice = tv.Choice("choice", [1, 2, 3, 4])
+        choice = tv.Choice.from_state(choice.get_state())
+        self.assertEqual(choice.default, 1)
+        self.assertIn(choice.random(), [1, 2, 3, 4])
+        self.assertIn(choice.random(1234), [1, 2, 3, 4])
+
+        choice = tv.Choice("choice", [1, 2, 3, 4], default=2)
+        choice = tv.Choice.from_state(choice.get_state())
+        self.assertEqual(choice.default, 2)
+        self.assertIn(choice.random(), [1, 2, 3, 4])
+        self.assertIn(choice.random(1234), [1, 2, 3, 4])
+
+    def test_int_range(self):
+        int_range = tv.IntRange("int_range", start=1, stop=4, default=2)
+        int_range = tv.IntRange.from_state(int_range.get_state())
+        self.assertEqual(int_range.default, 2)
+        self.assertIn(int_range.random(), [1, 2, 3, 4])
+        self.assertIn(int_range.random(1234), [1, 2, 3, 4])
+        self.assertNotEqual(int_range.default, 4)
+
+        int_range = tv.IntRange(
+            "int_range", start=1, stop=8, step=2, default=3, endpoint=True)
+        int_range = tv.IntRange.from_state(int_range.get_state())
+        self.assertEqual(int_range.default, 3)
+        self.assertIn(int_range.random(), [1, 3, 5, 7])
+        self.assertIn(int_range.random(1234), [1, 3, 5, 7])
+        self.assertNotEqual(int_range.default, 2)
+
+    def test_float_range(self):
+        float_range = tv.FloatRange(
+            "float_range", start=0.4, stop=4.4, default=2.0)
+        float_range = tv.FloatRange.from_state(float_range.get_state())
+        self.assertEqual(float_range.default, 2.0)
+        self.assertGreater(float_range.random(), 0.4)
+        self.assertLess(float_range.random(1234), 4.4)
+        self.assertNotAlmostEqual(float_range.random(), 1)
+        self.assertNotAlmostEqual(float_range.random(), 4.4)
+
+        float_range = tv.FloatRange(
+            "float_range",
+            start=0.4,
+            stop=8.4,
+            step=2.0,
+            default=3.0,
+            endpoint=True)
+        float_range = tv.FloatRange.from_state(float_range.get_state())
+        self.assertEqual(float_range.default, 3.0)
+        self.assertGreater(float_range.random(), 0.4)
+        self.assertLessEqual(float_range.random(1234), 8.4)
+        self.assertNotAlmostEqual(float_range.random(), 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py
new file mode 100644
index 0000000000000..1179fd9a9f088
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import numpy as np
+import paddle.nn as nn
+import paddle.utils as utils
+import paddle.static as static
+import paddle.nn.functional as F
+import paddle.distributed.auto_parallel as auto
+
+from paddle.distributed import fleet
+from paddle.distributed.auto_parallel.completion import Completer
+from paddle.distributed.auto_parallel.partitioner import Partitioner
+from paddle.distributed.auto_parallel.utils import make_data_unshard
+from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
+from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context
+from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
+
+paddle.enable_static()
+
+batch_size = 4
+epoch_num = 10
+hidden_size = 1024
+sequence_len = 512
+_g_process_mesh = [[0, 1], [2, 3]]
+
+
+def get_random_inputs_and_labels(input_shape, label_shape):
+    input = np.random.random(size=input_shape).astype('float32')
+    label = np.random.random(size=label_shape).astype('float32')
+    return input, label
+
+
+def batch_generator_creator():
+    def __reader__():
+        for _ in range(batch_size):
+            batch_input, batch_label = get_random_inputs_and_labels(
+                [batch_size, sequence_len, hidden_size],
+                [batch_size, sequence_len, 1])
+            yield batch_input, batch_label
+
+    return __reader__
+
+
+class MLPLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size=1024,
+                 intermediate_size=4 * 1024,
+                 dropout_ratio=0.1,
+                 initializer_range=0.02):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        param_initializer = nn.initializer.Normal(
+            mean=0.0, std=initializer_range)
+
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        self.linear0 = nn.Linear(
+            d_model,
+            dim_feedforward,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None)
+        self.linear1 = nn.Linear(
+            dim_feedforward,
+            d_model,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None)
+
+    def forward(self, input):
+        out = self.norm(input)
+        auto.shard_tensor(
+            self.linear0.weight,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, 0]
+            })
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        auto.shard_tensor(
+            self.linear1.weight,
+            dist_attr={
+                "process_mesh": _g_process_mesh[1],
+                "dims_mapping": [0, -1]
+            })
+        out = self.linear1(out)
+
+        return out
+
+
+def loop_cond(i, loop_len, input_array):
+    return i < loop_len
+
+
+def loop_body(i, loop_len, input_array):
+    pre_input = paddle.tensor.array_read(array=input_array, i=i)
+    mlp_while0 = MLPLayer(
+        hidden_size=hidden_size,
+        intermediate_size=4 * hidden_size,
+        dropout_ratio=0.1,
+        initializer_range=0.02)
+
+    mlp_while1 = MLPLayer(
+        hidden_size=hidden_size,
+        intermediate_size=4 * hidden_size,
+        dropout_ratio=0.1,
+        initializer_range=0.02)
+
+    output = mlp_while0(pre_input)
+    cur_pred = mlp_while1(output)
+    # 更新循环条件
+    i = paddle.increment(x=i, value=1)
+    paddle.tensor.array_write(cur_pred, array=input_array, i=i)
+    return i, loop_len, input_array
+
+
+def get_program():
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.semi_auto = True
+    # fleet.init(is_collective=True, strategy=dist_strategy)
+
+    train_program = static.Program()
+    start_program = static.Program()
+    with static.program_guard(train_program, start_program):
+
+        # 循环计数器
+        i = paddle.full(shape=[1], fill_value=0, dtype='int64')
+        # 循环次数
+        loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64')
+
+        # input
+        input = static.data(
+            name="input",
+            shape=[batch_size, sequence_len, hidden_size],
+            dtype='float32')
+        label = static.data(
+            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+        data_holder = [input, label]
+        # dataloader
+        dataloader = paddle.io.DataLoader.from_generator(
+            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
+        dataloader.set_batch_generator(
+            batch_generator_creator(), places=paddle.static.cuda_places())
+        # data dist_attr
+        auto.shard_tensor(
+            input,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, -1, -1]
+            })
+        auto.shard_tensor(
+            label,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, -1, -1]
+            })
+
+        mlp_start = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02)
+        pred = mlp_start(input)
+
+        input_array = paddle.tensor.array_write(pred, i)
+        i, loop_len, input_array = static.nn.while_loop(
+            cond=loop_cond,
+            body=loop_body,
+            loop_vars=[i, loop_len, input_array])
+        end_pred = paddle.tensor.array_read(array=input_array, i=i)
+
+        mlp_end = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02)
+        pred = mlp_end(end_pred)
+
+        error_cost = paddle.nn.functional.square_error_cost(pred, label)
+        loss = paddle.mean(error_cost)
+
+    return train_program, start_program, dataloader, i, loss
+
+
+class TestMLP(unittest.TestCase):
+    def test_completer(self):
+        train_program, start_program, dataloader, i, loss = get_program()
+        dist_context = DistributedContext()
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
+        # print_program_with_dist_attr(complete_train_program, dist_context)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
index cc2e14d6d6c2e..341ec852c5219 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py
@@ -50,3 +50,7 @@ class TestExponentialFamilyException(unittest.TestCase):
     def test_entropy_exception(self):
         with self.assertRaises(NotImplementedError):
             paddle.distribution.ExponentialFamily.entropy(self.dist)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py
index a1413722446e2..55358380c8b23 100644
--- a/python/paddle/fluid/tests/unittests/distribution/test_kl.py
+++ b/python/paddle/fluid/tests/unittests/distribution/test_kl.py
@@ -112,3 +112,7 @@ def test_kl_expfamily_expfamily(self):
             kl._kl_expfamily_expfamily(self.p, self.q),
             rtol=config.RTOL.get(config.DEFAULT_DTYPE),
             atol=config.ATOL.get(config.DEFAULT_DTYPE))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index cac64c7391351..2b8307461b8f5 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -27,6 +27,7 @@
 from paddle.fluid.optimizer import AdamOptimizer
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
+from paddle.fluid.framework import _test_eager_guard
 
 from predictor_utils import PredictorTools
 
@@ -155,6 +156,13 @@ def test_mnist_to_static(self):
             np.allclose(dygraph_loss, static_loss),
             msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
                                                             static_loss))
+        with _test_eager_guard():
+            dygraph_loss = self.train_dygraph()
+            static_loss = self.train_static()
+            self.assertTrue(
+                np.allclose(dygraph_loss, static_loss),
+                msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
+                                                                static_loss))
 
     def test_mnist_declarative_cpu_vs_mkldnn(self):
         dygraph_loss_cpu = self.train_dygraph()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
index 06d69daa75d1c..d05be03bbfb19 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
@@ -223,6 +223,12 @@ def dyfunc_len_paddle_shape():
         print(x)
 
 
+def dyfunc_dict_assign_shape():
+    x = paddle.to_tensor([1, 2])
+    a = {}
+    a['shape'] = x.shape[0]
+
+
 # 1. Basic tests without control flow
 class TestTensorShapeBasic(unittest.TestCase):
     def setUp(self):
@@ -592,6 +598,8 @@ class TestPaddleShape(unittest.TestCase):
     def test_paddle_shape(self):
         func = paddle.jit.to_static(dyfunc_len_paddle_shape)
         self.assertEqual('paddle.shape(x)' in func.code, True)
+        func = paddle.jit.to_static(dyfunc_dict_assign_shape)
+        self.assertEqual("__static_convert_var_shape_suffix" in func.code, True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/init_process_group.py
new file mode 100644
index 0000000000000..90926b1a021d3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/init_process_group.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+from paddle.fluid import core
+import datetime
+from datetime import timedelta
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+
+class TestProcessGroupFp32(unittest.TestCase):
+    def setUp(self):
+        self.config()
+
+    def config(self):
+        pass
+
+    def test_init_process_group(self):
+        paddle.distributed.collective._init_parallel_env()
+        paddle.distributed.collective._new_group()
+        with self.assertRaises(ValueError):
+            paddle.distributed.collective._new_group(
+                backend="gloo", group_name="_default_pg")
+        print("test ok\n")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
index 4f17c90de72ad..35f4ca17d5eba 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py
@@ -98,5 +98,117 @@ def test(self):
         self.check(output_dict)
 
 
+class TestAssignFp32Value(TestBase):
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 3, 1])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
+
+        data = np.random.uniform(size=[2, 3, 1])
+        self.assign_fp32 = data.astype(np.float32)
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+
+                assign = paddle.assign(self.assign_fp32)
+                out = paddle.fluid.layers.elementwise_add(x, assign)
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+
+class TestAssignBoolValue(TestBase):
+    def set_data_feed(self):
+        data = np.random.uniform(size=[2, 3, 1])
+        self.feed_fp32 = {'in_0': data.astype(np.float32)}
+        self.feed_fp16 = {'in_0': data.astype(np.float16)}
+        data = np.random.choice([True, False], size=(2, 3, 1))
+        self.assign_bool = data.astype(np.bool)
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='float32')
+                x = paddle.less_than(x, x)
+                assign = paddle.assign(self.assign_bool)
+                out = paddle.logical_and(x, assign)
+                out = paddle.cast(out, 'float32')
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed_fp32
+            if exec_mode > ExecutionMode.IPU_FP32:
+                feed = self.feed_fp16
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+            return result[0]
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
index 05a37dcb3d514..934ad10142827 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
@@ -22,33 +22,18 @@
 
 @unittest.skipIf(not paddle.is_compiled_with_ipu(),
                  "core is not compiled with IPU")
-class TestBase(IPUOpTest):
+class TestGreaterThan(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_data_feed()
-        self.set_feed_attr()
-        self.set_op_attrs()
+        self.set_test_op()
 
     @property
     def fp16_enabled(self):
         return True
 
-    def set_data_feed(self):
-        x = np.random.randn(3, 4, 5)
-        y = np.random.randn(3, 4, 5)
-        self.feed_fp32 = {
-            "x": x.astype(np.float32),
-            "y": y.astype(np.float32),
-        }
-        self.feed_fp16 = {
-            "x": x.astype(np.float16),
-            "y": y.astype(np.float16),
-        }
-
-    def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
-        self.feed_list = list(self.feed_fp32.keys())
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.greater_than
 
     def set_op_attrs(self):
         self.attrs = {}
@@ -71,7 +56,7 @@ def _test_base(self, exec_mode):
                     shape=self.feed_shape[1],
                     dtype='float32')
 
-                out = paddle.fluid.layers.greater_than(x, y, **self.attrs)
+                out = self.op(x, y, **self.attrs)
 
                 fetch_list = [out.name]
 
@@ -102,7 +87,7 @@ def _test_base(self, exec_mode):
             result = exe.run(program, feed=feed, fetch_list=fetch_list)
             return result[0]
 
-    def test(self):
+    def run_test_base(self):
         output_dict = {}
         for mode in ExecutionMode:
             if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
@@ -111,29 +96,73 @@ def test(self):
 
         self.check(output_dict)
 
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_data_feed0(self):
+        x = np.random.randn(3, 4, 5)
+        y = np.random.randn(3, 4, 5)
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "y": y.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "y": y.astype(np.float16),
+        }
+        self.set_feed_attr()
 
-class TestCase1(TestBase):
-    def set_data_feed(self):
+    def set_data_feed1(self):
         x = np.ones([1, 10])
         y = np.ones([10])
         self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
         self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+        self.set_feed_attr()
 
-
-class TestCase2(TestBase):
-    def set_data_feed(self):
+    def set_data_feed2(self):
         x = np.ones([1, 10])
         y = np.zeros([1, 10])
         self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
         self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+        self.set_feed_attr()
 
-
-class TestCase3(TestBase):
-    def set_data_feed(self):
+    def set_data_feed3(self):
         x = np.zeros([1, 10])
         y = np.ones([1, 10])
         self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)}
         self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)}
+        self.set_feed_attr()
+
+    def test_case0(self):
+        self.set_data_feed0()
+        self.set_op_attrs()
+        self.run_test_base()
+
+    def test_case1(self):
+        self.set_data_feed1()
+        self.set_op_attrs()
+        self.run_test_base()
+
+    def test_case2(self):
+        self.set_data_feed2()
+        self.set_op_attrs()
+        self.run_test_base()
+
+    def test_case3(self):
+        self.set_data_feed3()
+        self.set_op_attrs()
+        self.run_test_base()
+
+
+class TestLessThan(TestGreaterThan):
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.less_than
+
+
+class TestEqual(TestGreaterThan):
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.equal
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
index 026b19eccf187..76ab1a2c3f311 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
+import unittest
 
 import numpy as np
-import unittest
 import paddle
+import paddle.static
 
 paddle.enable_static()
 
@@ -26,30 +26,31 @@
 class TestIpuShard(unittest.TestCase):
     def _test(self):
         # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_index : no
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
+            b = a + 2  # scale : scale * x + bias, ipu_index : no
+
+            with paddle.static.ipu_shard_guard(index=1):
+                c = b + 1  # scale, ipu_index : 1
+                with paddle.static.ipu_shard_guard(index=2):
+                    d = c * 2  # scale, ipu_index : 2
+                with paddle.static.ipu_shard_guard(index=3):
+                    e = d + 3  # scale, ipu_index : 3
+                    with paddle.static.ipu_shard_guard(index=1):
+                        e = e + 3  # scale, ipu_index : 1
+                        with paddle.static.ipu_shard_guard(index=2):
+                            e = e + 3  # scale, ipu_index : 2
+
+            with paddle.static.ipu_shard_guard(index=1):
+                f = paddle.tensor.pow(e, 2.0)  # pow, ipu_index : 1
 
-        with paddle.static.ipu_shard_guard(index=1):
-            c = b + 1  # scale, ipu_index : 1
             with paddle.static.ipu_shard_guard(index=2):
-                d = c * 2  # scale, ipu_index : 2
-            with paddle.static.ipu_shard_guard(index=3):
-                e = d + 3  # scale, ipu_index : 3
-                with paddle.static.ipu_shard_guard(index=1):
-                    e = e + 3  # scale, ipu_index : 1
-                    with paddle.static.ipu_shard_guard(index=2):
-                        e = e + 3  # scale, ipu_index : 2
-
-        with paddle.static.ipu_shard_guard(index=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_index : 1
+                g = f - 1  # scale, ipu_index : 2
 
-        with paddle.static.ipu_shard_guard(index=2):
-            g = f - 1  # scale, ipu_index : 2
-
-        h = g + 1  # scale, ipu_index : no
+            h = g + 1  # scale, ipu_index : no
 
         ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
         for op in main_prog.global_block().ops:
             if op.desc.has_attr("ipu_index"):
                 ipu_index_list.append(op.desc.attr("ipu_index"))
@@ -69,30 +70,31 @@ def test_ipu_shard(self):
 class TestIpuPipeline(unittest.TestCase):
     def _test(self):
         # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_stage : no
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog):
+            a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
+            b = a + 2  # scale : scale * x + bias, ipu_stage : no
+
+            with paddle.static.ipu_shard_guard(stage=1):
+                c = b + 1  # scale, ipu_stage : 1
+                with paddle.static.ipu_shard_guard(stage=2):
+                    d = c * 2  # scale, ipu_stage : 2
+                with paddle.static.ipu_shard_guard(stage=3):
+                    e = d + 3  # scale, ipu_stage : 3
+                    with paddle.static.ipu_shard_guard(stage=1):
+                        e = e + 3  # scale, ipu_stage : 1
+                        with paddle.static.ipu_shard_guard(stage=2):
+                            e = e + 3  # scale, ipu_stage : 2
+
+            with paddle.static.ipu_shard_guard(stage=1):
+                f = paddle.tensor.pow(e, 2.0)  # pow, ipu_stage : 1
 
-        with paddle.static.ipu_shard_guard(stage=1):
-            c = b + 1  # scale, ipu_stage : 1
             with paddle.static.ipu_shard_guard(stage=2):
-                d = c * 2  # scale, ipu_stage : 2
-            with paddle.static.ipu_shard_guard(stage=3):
-                e = d + 3  # scale, ipu_stage : 3
-                with paddle.static.ipu_shard_guard(stage=1):
-                    e = e + 3  # scale, ipu_stage : 1
-                    with paddle.static.ipu_shard_guard(stage=2):
-                        e = e + 3  # scale, ipu_stage : 2
-
-        with paddle.static.ipu_shard_guard(stage=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_stage : 1
-
-        with paddle.static.ipu_shard_guard(stage=2):
-            g = f - 1  # scale, ipu_stage : 2
+                g = f - 1  # scale, ipu_stage : 2
 
-        h = g + 1  # scale, ipu_stage : no
+            h = g + 1  # scale, ipu_stage : no
 
         ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
         for op in main_prog.global_block().ops:
             if op.desc.has_attr("ipu_stage"):
                 ipu_index_list.append(op.desc.attr("ipu_stage"))
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
index f120f5594914e..debd9ed19827c 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
@@ -26,7 +26,13 @@ class TestIpuStrategy(unittest.TestCase):
     def test_set_options(self):
         ipu_strategy = paddle.static.IpuStrategy()
         all_option_names = ipu_strategy._ipu_strategy.get_all_option_names()
+        skip_options = []
+        skip_options.append('random_seed')
+
         for option_name in all_option_names:
+            if option_name in skip_options:
+                continue
+
             option = ipu_strategy._ipu_strategy.get_option(option_name)
             option_type = option['type']
             option_value = option['value']
@@ -38,9 +44,13 @@ def test_set_options(self):
                 set_value = not option_value
             else:
                 continue
-            ipu_strategy.set_options({option_name: set_value})
-            new_value = ipu_strategy.get_option(option_name)
-            assert new_value == set_value, f"set {option_name} to {set_value} failed"
+
+            try:
+                ipu_strategy.set_options({option_name: set_value})
+                new_value = ipu_strategy.get_option(option_name)
+                assert new_value == set_value, f"set {option_name} to {set_value} failed"
+            except:
+                raise Exception(f"set {option_name} to {set_value} failed")
 
     def test_set_string_options(self):
         ipu_strategy = paddle.static.IpuStrategy()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py
new file mode 100644
index 0000000000000..05572a72ea8b2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py
@@ -0,0 +1,121 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestLogicalAnd(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_test_op()
+
+    @property
+    def fp16_enabled(self):
+        return False
+
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.logical_and
+
+    def set_op_attrs(self):
+        self.attrs = {}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype=self.feed_dtype[0])
+                y = paddle.static.data(
+                    name=self.feed_list[1],
+                    shape=self.feed_shape[1],
+                    dtype=self.feed_dtype[1])
+
+                out = self.op(x, y, **self.attrs)
+
+            fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
+            return result[0]
+
+    def run_test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+                break
+            output_dict[mode] = self._test_base(mode).astype(np.int32)
+
+        self.check(output_dict, check_shape=True)
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+        self.feed_dtype = ['bool', 'bool']
+
+    def set_data_feed0(self):
+        x = np.random.choice([True, False], size=(1, 3, 5, 5))
+        y = np.random.choice([True, False], size=(1, 3, 5, 5))
+        self.feed = {
+            "x": x.astype('bool'),
+            "y": y.astype('bool'),
+        }
+        self.set_feed_attr()
+
+    def test_case0(self):
+        self.set_data_feed0()
+        self.set_op_attrs()
+        self.run_test_base()
+
+
+class TestLogicalOr(TestLogicalAnd):
+    def set_test_op(self):
+        self.op = paddle.fluid.layers.logical_or
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py
new file mode 100644
index 0000000000000..33a5dc888c245
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py
@@ -0,0 +1,110 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data1 = np.array([[1], [1], [3], [0]])
+
+        self.feed = {'x': data1.astype(np.int32)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {"depth": 4, "allow_out_of_range": False}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='int32')
+
+                out = paddle.fluid.layers.one_hot(x, **self.attrs)
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled):
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+@unittest.skip('does not support allow_out_of_range=True')
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {"depth": 4, "allow_out_of_range": True}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py
new file mode 100644
index 0000000000000..79fc9b04e1674
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py
@@ -0,0 +1,110 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode
+
+
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
+                 "core is not compiled with IPU")
+class TestBase(IPUOpTest):
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data1 = np.array([[1], [1], [3], [0]])
+
+        self.feed = {'x': data1.astype(np.int32)}
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed.values()]
+        self.feed_list = list(self.feed.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {"depth": 4, "allow_out_of_range": False}
+
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
+
+        with paddle.static.scope_guard(scope):
+            with paddle.static.program_guard(main_prog, startup_prog):
+                x = paddle.static.data(
+                    name=self.feed_list[0],
+                    shape=self.feed_shape[0],
+                    dtype='int32')
+
+                out = paddle.fluid.input.one_hot(x, **self.attrs)
+
+                fetch_list = [out.name]
+
+            if exec_mode == ExecutionMode.CPU_FP32:
+                place = paddle.CPUPlace()
+            else:
+                place = paddle.IPUPlace()
+
+            exe = paddle.static.Executor(place)
+            exe.run(startup_prog)
+
+            if exec_mode != ExecutionMode.CPU_FP32:
+                feed_list = self.feed_list
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+                if exec_mode == ExecutionMode.IPU_POPART_FP16:
+                    ipu_strategy.set_precision_config(enable_fp16=True)
+                program = paddle.static.IpuCompiledProgram(
+                    main_prog,
+                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
+            else:
+                program = main_prog
+
+            feed = self.feed
+
+            result = exe.run(program, feed=feed, fetch_list=fetch_list)
+
+            return result[0]
+
+    def test_base(self):
+        output_dict = {}
+        for mode in ExecutionMode:
+            if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled):
+                break
+            output_dict[mode] = self._test_base(mode).flatten()
+
+        self.check(output_dict)
+
+
+@unittest.skip('does not support allow_out_of_range=True')
+class TestCase1(TestBase):
+    def set_op_attrs(self):
+        self.attrs = {"depth": 4, "allow_out_of_range": True}
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
index 1cc10da3d7344..bc9d05c4a87ec 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py
@@ -91,6 +91,15 @@ def _test_optimizer(self, run_ipu=True):
                 ipu_strategy = paddle.static.IpuStrategy()
                 ipu_strategy.set_graph_config(is_training=True)
                 ipu_strategy.loss_scaling = self.attrs["loss_scaling"]
+                if "use_no_bias_optimizer" in self.attrs.keys():
+                    ipu_strategy.set_options({
+                        "use_no_bias_optimizer":
+                        self.attrs["use_no_bias_optimizer"]
+                    })
+                if "accl1_type" in self.attrs.keys():
+                    ipu_strategy.set_options({
+                        "accl1_type": self.attrs["accl1_type"]
+                    })
                 program = paddle.static.IpuCompiledProgram(
                     main_prog, ipu_strategy=ipu_strategy).compile(feed_list,
                                                                   fetch_list)
@@ -141,6 +150,28 @@ def set_attrs(self):
         }
 
 
+@unittest.skip('cpu do not support AdamNoBias')
+class TestAdamNoBias(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'adam',
+            "weight_decay": 0.0,
+            "loss_scaling": 4.0,
+            "use_no_bias_optimizer": True,
+        }
+
+
+@unittest.skip('cpu do not support FLOAT16')
+class TestAdamCase3(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'adam',
+            "weight_decay": 0.0,
+            "loss_scaling": 4.0,
+            "accl1_type": "FLOAT16",
+        }
+
+
 @unittest.skip('seems cpu output wrong')
 class TestLambCase1(TestBase):
     def set_attrs(self):
@@ -161,5 +192,27 @@ def set_attrs(self):
         }
 
 
+@unittest.skip('cpu do not support LambNoBias')
+class TestLambNoBias(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'lamb',
+            "weight_decay": 0.1,
+            "loss_scaling": 6.0,
+            "use_no_bias_optimizer": True
+        }
+
+
+@unittest.skip('cpu do not support FLOAT16')
+class TestLambCase2(TestBase):
+    def set_attrs(self):
+        self.attrs = {
+            "optimizer": 'lamb',
+            "weight_decay": 0.1,
+            "loss_scaling": 6.0,
+            "accl1_type": "FLOAT16"
+        }
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
index 3a69487306208..ba6eb4d38bcf2 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
@@ -95,12 +95,9 @@ def _test_base(self, save_otherwise_load):
                     is_training=self.attrs['is_training'])
                 ipu_strategy.set_precision_config(
                     enable_fp16=self.attrs['enable_fp16'])
-                ipu_strategy.set_options({
-                    'save_per_n_step': self.attrs['save_at_step']
-                })
-                program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy).compile(
-                        self.feed_list, fetch_list)
+                ipu_program = paddle.static.IpuCompiledProgram(
+                    main_prog, ipu_strategy=ipu_strategy)
+                program = ipu_program.compile(self.feed_list, fetch_list)
 
                 result = []
                 run_steps = self.attrs['steps'] if save_otherwise_load \
@@ -111,10 +108,9 @@ def _test_base(self, save_otherwise_load):
                 for i in range(run_steps):
                     tmp = exe.run(program, feed=feed, fetch_list=fetch_list)
 
-                    # currently, we update opt state every sess.run,
-                    # will optimize
                     if save_otherwise_load and \
                         i == self.attrs['save_at_step'] - 1:
+                        ipu_program._backend.weights_to_host()
                         paddle.static.save(main_prog,
                                            self.attrs['model_path'].name)
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
index 9a18922f35331..6702ae4344e91 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py
@@ -88,11 +88,10 @@ def _test_base(self, exec_mode):
             if exec_mode != ExecutionMode.CPU_FP32:
                 feed_list = self.feed_list
                 ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.set_graph_config(is_training=self.is_training)
+                ipu_strategy.set_graph_config(
+                    is_training=self.is_training, micro_batch_size=2)
                 if exec_mode == ExecutionMode.IPU_POPART_FP16:
                     ipu_strategy.set_precision_config(enable_fp16=True)
-                # set batch size
-                ipu_strategy.micro_batch_size = 2
                 program = paddle.static.IpuCompiledProgram(
                     main_prog,
                     ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
index 66c547de2c280..2e84607e2f5c2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py
@@ -25,17 +25,120 @@
 import hypothesis.strategies as st
 
 
-class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest):
+# the two inputs of elementwise_add are tensor
+class TestConvElementwiseAddMkldnnFusePass1(PassAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         attrs = [
             program_config.ops[i].attrs
             for i in range(len(program_config.ops))
         ]
-        # If the problem has been fixed, the judgment 
-        # needs to be deleted!!!
-        if attrs[1]['data_format'] == "NHWC":
+        if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0:
+            return False
+        if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1:
             return False
+        return True
+
+    def sample_program_config(self, draw):
+        data_format = draw(st.sampled_from(["NCHW", "NHWC"]))
+        dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]]))
+        padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"]))
+        groups = draw(st.sampled_from([1, 2, 4]))
+        paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]]))
+        strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]]))
+        axis = draw(st.sampled_from([-1, 0]))
+        batch_size = draw(st.integers(min_value=1, max_value=4))
+
+        def generate_input():
+            if data_format == "NCHW":
+                return np.random.random(
+                    [batch_size, 48, 64, 64]).astype(np.float32)
+            else:
+                return np.random.random(
+                    [batch_size, 64, 64, 48]).astype(np.float32)
+
+        def generate_weight():
+            return np.random.random(
+                [48, int(48 / groups), 3, 3]).astype(np.float32)
+
+        relu_op = OpConfig(
+            type="relu",
+            inputs={"X": ["input_data"]},
+            outputs={"Out": ["relu_out"]},
+            attrs={})
+
+        conv2d_op1 = OpConfig(
+            type="conv2d",
+            inputs={"Input": ["relu_out"],
+                    "Filter": ["conv_weight1"]},
+            outputs={"Output": ["conv_output1"]},
+            attrs={
+                "data_format": data_format,
+                "dilations": dilations,
+                "padding_algorithm": padding_algorithm,
+                "groups": groups,
+                "paddings": paddings,
+                "strides": strides
+            })
+
+        conv2d_op2 = OpConfig(
+            type="conv2d",
+            inputs={"Input": ["input_data"],
+                    "Filter": ["conv_weight2"]},
+            outputs={"Output": ["conv_output2"]},
+            attrs={
+                "data_format": data_format,
+                "dilations": dilations,
+                "padding_algorithm": padding_algorithm,
+                "groups": groups,
+                "paddings": paddings,
+                "strides": strides
+            })
+
+        elt_op = OpConfig(
+            type="elementwise_add",
+            inputs={"X": ["conv_output1"],
+                    "Y": ["conv_output2"]},
+            outputs={"Out": ["elementwise_output"]},
+            attrs={'axis': axis})
 
+        model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op]
+
+        program_config = ProgramConfig(
+            ops=model_net,
+            weights={
+                "conv_weight1": TensorConfig(data_gen=partial(generate_weight)),
+                "conv_weight2": TensorConfig(data_gen=partial(generate_weight))
+            },
+            inputs={
+                "input_data": TensorConfig(data_gen=partial(generate_input))
+            },
+            outputs=["elementwise_output"])
+
+        return program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_mkldnn=True)
+        yield config, ["relu", "conv2d", "conv2d"], (1e-5, 1e-5)
+
+    def test(self):
+        self.run_and_statis(
+            quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"])
+
+
+'''
+class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+        if "elementwise_weight" in program_config.weights:
+            if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[1]:
+                if attrs[2]['axis'] != 1:
+                    return False
+            if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[3]:
+                if attrs[2]['axis'] != -1:
+                    return False
         return True
 
     def sample_program_config(self, draw):
@@ -101,7 +204,7 @@ def generate_weight2():
                 "strides": strides
             })
 
-        if axis == -1 or axis == 0:
+        if axis == 0:
             elt_op = OpConfig(
                 type="elementwise_add",
                 inputs={"X": ["input_data1"],
@@ -118,14 +221,12 @@ def generate_weight2():
 
         model_net = [relu_op, conv2d_op, elt_op]
 
-        if axis == 1:
+        if axis == 0:
             program_config = ProgramConfig(
                 ops=model_net,
                 weights={
                     "conv_weight":
-                    TensorConfig(data_gen=partial(generate_weight1)),
-                    "elementwise_weight":
-                    TensorConfig(data_gen=partial(generate_weight2))
+                    TensorConfig(data_gen=partial(generate_weight1))
                 },
                 inputs={
                     "input_data1":
@@ -137,7 +238,9 @@ def generate_weight2():
                 ops=model_net,
                 weights={
                     "conv_weight":
-                    TensorConfig(data_gen=partial(generate_weight1))
+                    TensorConfig(data_gen=partial(generate_weight1)),
+                    "elementwise_weight":
+                    TensorConfig(data_gen=partial(generate_weight2))
                 },
                 inputs={
                     "input_data1":
@@ -154,7 +257,7 @@ def sample_predictor_configs(self, program_config):
     def test(self):
         self.run_and_statis(
             quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"])
-
+'''
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
index 33df428388882..81bb182802ede 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py
@@ -19,6 +19,7 @@
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
 import unittest
+import paddle
 
 import hypothesis
 from hypothesis import given, settings, seed, example, assume
@@ -104,4 +105,5 @@ def test(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
new file mode 100644
index 0000000000000..57fa56acd6875
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py
@@ -0,0 +1,145 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+
+paddle.enable_static()
+SEED = 2022
+
+
+class TestCheckFiniteAndUnscaleOp(OpTest):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "check_finite_and_unscale"
+        self.init_dtype()
+        self.init_test_case()
+
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([0]),
+            'Out': [('out0', x / scale)],
+        }
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        x[128][128] = np.nan
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x)],
+        }
+
+    def test_check_output(self):
+        # When input contains nan, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x = np.random.random((129, 129)).astype(self.dtype)
+        x[128][128] = np.inf
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([0]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+
+class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x0[128][128] = np.nan
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp):
+    def init_test_case(self):
+        x0 = np.random.random((129, 129)).astype(self.dtype)
+        x0[128][128] = np.nan
+        x1 = np.random.random((129, 129)).astype(self.dtype)
+        x1[128][128] = np.inf
+        scale = np.random.random((1)).astype(self.dtype)
+
+        self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale}
+        self.outputs = {
+            'FoundInfinite': np.array([1]),
+            'Out': [('out0', x0 / scale), ('out1', x1 / scale)],
+        }
+
+    def test_check_output(self):
+        # When input contains inf, do not check the output, 
+        # since the output may be nondeterministic and will be discarded.
+        self.check_output_with_place(self.place, no_check_set=['Out'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
index 8e31d58195be8..e9d9af5c11366 100644
--- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
@@ -22,4 +22,5 @@ if (WITH_ASCEND_CL)
     set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
     set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
index d51976e1a1962..71d4b45e61b18 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py
@@ -71,7 +71,7 @@ class TestAssignValueNPUOp4(TestAssignValueNPUOp):
     def init_data(self):
         self.value = numpy.random.choice(
             a=[False, True], size=(2, 5)).astype(numpy.bool)
-        self.attrs["bool_values"] = [bool(v) for v in self.value.flat]
+        self.attrs["bool_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignApi(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
index 877f9904f3407..e01b2b691a28a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py
@@ -144,6 +144,7 @@ def set_npu(self):
 
     def setUp(self):
         self.set_npu()
+        self.init_dtype()
         self.use_mkldnn = False
         self.fuse_with_relu = False
         self.data_formats = ["NCHW", "NHWC"]
@@ -153,6 +154,9 @@ def setUp(self):
         self.init_kernel_type()
         self.init_test_case()
 
+    def init_dtype(self):
+        self.dtype = np.float32
+
     def init_test_case(self):
         self.use_global_stats = False
         self.no_grad_set = set()
@@ -210,11 +214,16 @@ def test_with_place(place, data_layout, shape):
             scale_shape = [c]
 
             np.random.seed(123)
-            x = np.random.random_sample(shape).astype(np.float32)
+            x = np.random.random_sample(shape).astype(self.dtype)
             scale = np.random.random_sample(scale_shape).astype(np.float32)
             bias = np.random.random_sample(scale_shape).astype(np.float32)
             mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
-            y_grad = np.random.random_sample(shape).astype(np.float32)
+
+            if self.dtype == np.float16:
+                mean = mean.astype(np.float32)
+                variance = variance.astype(np.float32)
+
+            y_grad = np.random.random_sample(shape).astype(self.dtype)
             momentum_var = np.array([momentum]).astype(np.float32)
 
             y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
@@ -275,7 +284,7 @@ def test_with_place(place, data_layout, shape):
                     inputs=inputs,
                     outputs=outputs,
                     attrs=attrs)
-                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+                block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape)
 
                 # generate backward op_desc
                 grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
@@ -320,6 +329,11 @@ def init_kernel_type(self):
         pass
 
 
+class TestFP16BatchNormOpTraining(TestBatchNormOpTraining):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
     def init_test_case(self):
         self.use_global_stats = False
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
index 012a6e59e775f..2e15a1eac2b4b 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py
@@ -132,36 +132,50 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-2)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         if self.dilations[0] == 1 and self.dilations[1] == 1:
-            self.check_grad_with_place(
-                self.place, {'Input', 'Filter'},
-                'Output',
-                max_relative_error=0.03,
-                numeric_place=paddle.CPUPlace())
+            if self.dtype == np.float16:
+                self.check_grad_with_place(
+                    self.place, {'Input', 'Filter'},
+                    'Output',
+                    max_relative_error=0.9)
+            else:
+                self.check_grad_with_place(
+                    self.place, {'Input', 'Filter'},
+                    'Output',
+                    max_relative_error=0.03,
+                    numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            no_grad_set=set(['Filter']),
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
-
-    def test_check_grad_no_input(self):
-        if self.dtype == np.float16:
-            return
-        if self.dilations[0] == 1 and self.dilations[1] == 1:
             self.check_grad_with_place(
-                self.place, ['Filter'],
+                self.place, ['Input'],
                 'Output',
-                no_grad_set=set(['Input']),
+                no_grad_set=set(['Filter']),
+                max_relative_error=0.9)
+        else:
+            self.check_grad_with_place(
+                self.place, ['Input'],
+                'Output',
+                no_grad_set=set(['Filter']),
                 max_relative_error=0.03,
                 numeric_place=paddle.CPUPlace())
 
+    def test_check_grad_no_input(self):
+        if self.dilations[0] == 1 and self.dilations[1] == 1:
+            if self.dtype == np.float16:
+                self.check_grad_with_place(
+                    self.place, ['Filter'],
+                    'Output',
+                    no_grad_set=set(['Input']),
+                    max_relative_error=0.9)
+            else:
+                self.check_grad_with_place(
+                    self.place, ['Filter'],
+                    'Output',
+                    no_grad_set=set(['Input']),
+                    max_relative_error=0.03,
+                    numeric_place=paddle.CPUPlace())
+
     def init_data_format(self):
         self.data_format = "NCHW"
 
@@ -267,32 +281,46 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            self.place, {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.03,
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                self.place, {'Input', 'Filter'},
+                'Output',
+                max_relative_error=1.2)
+        else:
+            self.check_grad_with_place(
+                self.place, {'Input', 'Filter'},
+                'Output',
+                max_relative_error=0.03,
+                numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            self.place, ['Input'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                self.place, ['Input'],
+                'Output',
+                max_relative_error=0.7,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad_with_place(
+                self.place, ['Input'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Filter']),
+                numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            self.place, ['Filter'],
-            'Output',
-            max_relative_error=0.03,
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                self.place, ['Filter'],
+                'Output',
+                max_relative_error=0.8,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad_with_place(
+                self.place, ['Filter'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Input']),
+                numeric_place=paddle.CPUPlace())
 
     def init_data_format(self):
         self.data_format = "NCHW"
diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
index d0dc86055a163..4070d0267d95b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py
@@ -127,8 +127,6 @@ def test_check_output(self):
         self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             fluid.NPUPlace(0), {'Input', 'Filter'},
             'Output',
@@ -136,8 +134,6 @@ def test_check_grad(self):
             numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             fluid.NPUPlace(0), ['Input'],
             'Output',
@@ -146,8 +142,6 @@ def test_check_grad_no_filter(self):
             numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             fluid.NPUPlace(0), ['Filter'],
             'Output',
@@ -276,10 +270,13 @@ class TestConv2DOp_v2(OpTest):
     def set_npu(self):
         self.__class__.use_npu = True
 
+    def init_dtype(self):
+        self.dtype = np.float32
+
     def setUp(self):
         self.set_npu()
         self.op_type = "conv2d"
-        self.dtype = np.float32
+        self.init_dtype()
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
@@ -320,31 +317,45 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            paddle.NPUPlace(0), {'Input', 'Filter'},
-            'Output',
-            max_relative_error=0.02,
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), {'Input', 'Filter'},
+                'Output',
+                max_relative_error=1.1)
+        else:
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), {'Input', 'Filter'},
+                'Output',
+                max_relative_error=0.02,
+                numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            paddle.NPUPlace(0), ['Input'],
-            'Output',
-            max_relative_error=0.02,
-            no_grad_set=set(['Filter']),
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), ['Input'],
+                'Output',
+                max_relative_error=0.99,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), ['Input'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Filter']),
+                numeric_place=paddle.CPUPlace())
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            paddle.NPUPlace(0), ['Filter'],
-            'Output',
-            no_grad_set=set(['Input']),
-            numeric_place=paddle.CPUPlace())
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), ['Filter'],
+                'Output',
+                max_relative_error=0.99,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad_with_place(
+                paddle.NPUPlace(0), ['Filter'],
+                'Output',
+                no_grad_set=set(['Input']),
+                numeric_place=paddle.CPUPlace())
 
     def init_test_case(self):
         self.pad = [0, 0]
diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
index 9b29fc812faed..a4769442b083e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py
@@ -51,8 +51,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-7)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
index bd9022f56a3e7..fea8502f2d766 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py
@@ -56,8 +56,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
index 75c70e0a131ac..f24c6c455a0cb 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py
@@ -65,36 +65,59 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16 or self.dtype == np.int64:
+        if self.dtype == np.int64:
             return
 
-        self.check_grad_with_place(
-            self.place,
-            ['X', 'Y'],
-            'Out',
-            max_relative_error=0.006, )
+        if self.dtype == np.float16:
+            self.check_grad_with_place(
+                self.place,
+                ['X', 'Y'],
+                'Out',
+                max_relative_error=0.15, )
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['X', 'Y'],
+                'Out',
+                max_relative_error=0.006, )
 
     def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16 or self.dtype == np.int64:
+        if self.dtype == np.int64:
             return
 
-        self.check_grad_with_place(
-            self.place,
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-            max_relative_error=0.006, )
+        if self.dtype == np.float16:
+            self.check_grad_with_place(
+                self.place,
+                ['Y'],
+                'Out',
+                no_grad_set=set("X"),
+                max_relative_error=0.92, )
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['Y'],
+                'Out',
+                no_grad_set=set("X"),
+                max_relative_error=0.006, )
 
     def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16 or self.dtype == np.int64:
+        if self.dtype == np.int64:
             return
 
-        self.check_grad_with_place(
-            self.place,
-            ['X'],
-            'Out',
-            no_grad_set=set("Y"),
-            max_relative_error=0.006, )
+        if self.dtype == np.float16:
+            self.check_grad_with_place(
+                self.place,
+                ['X'],
+                'Out',
+                no_grad_set=set("Y"),
+                max_relative_error=0.8, )
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['X'],
+                'Out',
+                no_grad_set=set("Y"),
+                max_relative_error=0.006, )
 
 
 class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
index 461e15352e383..cbfc07f354479 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py
@@ -116,19 +116,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['Y'], 'Out', no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['X'], 'Out', no_grad_set=set("Y"))
 
@@ -213,15 +207,11 @@ def init_input_output(self):
         self.out = np.maximum(self.x, self.y.reshape(1, 1, 100))
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
 
     def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16:
-            return
         _, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['Y'],
@@ -230,8 +220,6 @@ def test_check_grad_ingore_x(self):
             user_defined_grads=[dy])
 
     def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16:
-            return
         dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X'],
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
index 51cf5cdaf6d1a..e191224df81ee 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py
@@ -64,32 +64,41 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            return
-
-        self.check_grad_with_place(
-            self.place,
-            ['X', 'Y'],
-            'Out', )
+            self.check_grad_with_place(
+                self.place, ['X', 'Y'], 'Out', max_relative_error=0.5)
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['X', 'Y'],
+                'Out', )
 
     def test_check_grad_ingore_x(self):
         if self.dtype == np.float16:
-            return
-
-        self.check_grad_with_place(
-            self.place,
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"), )
+            self.check_grad_with_place(
+                self.place, ['Y'],
+                'Out',
+                no_grad_set=set("X"),
+                max_relative_error=0.9)
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['Y'],
+                'Out',
+                no_grad_set=set("X"), )
 
     def test_check_grad_ingore_y(self):
         if self.dtype == np.float16:
-            return
-
-        self.check_grad_with_place(
-            self.place,
-            ['X'],
-            'Out',
-            no_grad_set=set("Y"), )
+            self.check_grad_with_place(
+                self.place, ['X'],
+                'Out',
+                no_grad_set=set("Y"),
+                max_relative_error=0.1)
+        else:
+            self.check_grad_with_place(
+                self.place,
+                ['X'],
+                'Out',
+                no_grad_set=set("Y"), )
 
 
 class TestElementwiseMinOpFp16(TestElementwiseMinOp):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
index ce645f317d054..907e149c8b2c3 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
@@ -114,8 +114,6 @@ def init_input_output(self):
         self.out = np.power(self.x, self.y)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
@@ -184,8 +182,6 @@ def init_input_output(self):
         self.out = np.power(self.x, self.y)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
@@ -218,8 +214,6 @@ def init_input_output(self):
         self.out = np.power(self.x, self.y.reshape(1, 100, 1))
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
@@ -252,8 +246,6 @@ def init_input_output(self):
         self.out = np.power(self.x, self.y.reshape(100, 1, 1))
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis)
         self.check_grad_with_place(
             self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy])
diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
index ccd5f0649d8dc..6be2fe0086b12 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py
@@ -50,8 +50,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
index 89ac9e09aa348..83b65630d801a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
@@ -34,7 +34,7 @@ def setUp(self):
 
         self.init_dtype()
         np.random.seed(SEED)
-        x = np.random.randn(3, 1, 7).astype(self.dtype)
+        x = np.random.randn(30, 1, 7).astype(self.dtype)
         out = np.tile(x, [1, 10, 1])
 
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
@@ -50,12 +50,8 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
 
 
 class TestExpandV2(TestExpand):
@@ -66,7 +62,7 @@ def setUp(self):
 
         self.init_dtype()
         np.random.seed(SEED)
-        x = np.random.randn(3, 1, 7).astype(self.dtype)
+        x = np.random.randn(30, 1, 7).astype(self.dtype)
         out = np.tile(x, [1, 10, 1])
         expand_times = np.array([1, 10, 1]).astype(np.int32)
 
@@ -145,7 +141,7 @@ def setUp(self):
 
         self.init_dtype()
         np.random.seed(SEED)
-        x = np.random.randn(3, 1, 7).astype(self.dtype)
+        x = np.random.randn(30, 1, 7).astype(self.dtype)
         out = np.tile(x, [1, 1, 1])
 
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
index d7aafccc88cf8..f1d89cb8d561b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py
@@ -59,9 +59,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-5)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
     def set_npu(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
index 32042ba83a9f7..9495cdb8a55aa 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py
@@ -66,8 +66,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         # There is a problem that precision of grad result using float32
         # can't satisfy the default precision requirement 
         # when compared with numeric_grads, but the results on 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
index 1c9f499d22db4..a9c195bb8cd29 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py
@@ -81,13 +81,9 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['Y'],
             'Out',
@@ -95,8 +91,6 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("residual"))
 
     def test_check_grad_ingore_y(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['X'],
             'Out',
diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
index 6e5b4c012053f..d02ddae461ba5 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py
@@ -78,8 +78,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(self.place, ['X'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['X'], 'Out', max_relative_error=0.5)
+        else:
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
index 590a961269989..a0472f9611eb0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py
@@ -63,8 +63,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(self.place, ['X'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['X'], 'Out', max_relative_error=0.006)
+        else:
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
 class TestLeadyReluFP16(TestLeadyRelu):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
index 9534431e99a7a..5da3cb0ce5650 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
@@ -50,12 +50,8 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
 
 
 class TestLogFp16(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
index f6baefec7f29e..10ec8621ffa58 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py
@@ -63,9 +63,13 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(
-            self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad])
+            self.check_grad_with_place(
+                self.place, ['X'], ['Out'],
+                user_defined_grads=[self.x_grad],
+                max_relative_error=0.02)
+        else:
+            self.check_grad_with_place(
+                self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad])
 
 
 def test_class(op_type, typename):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index fefff0974ae40..8ec9eb1cf3572 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -77,8 +77,10 @@ def test_check_output(self):
 
     def test_check_grad(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(self.place, ['W'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['W'], 'Out', max_relative_error=0.01)
+        else:
+            self.check_grad_with_place(self.place, ['W'], 'Out')
 
 
 class TestLookupTableV2FP16(TestLookupTableV2):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
index f3df1fca30749..ec51dcf3f8e3e 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py
@@ -39,10 +39,11 @@ def setUp(self):
         self.set_npu()
         self.out_size = None
         self.actual_shape = None
+        self.init_dtype()
         self.data_layout = 'NCHW'
         self.init_test_case()
         self.op_type = "nearest_interp_v2"
-        input_np = np.random.random(self.input_shape).astype("float32")
+        input_np = np.random.random(self.input_shape).astype(self.dtype)
 
         if self.data_layout == "NCHW":
             in_h = self.input_shape[2]
@@ -95,8 +96,21 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        self.check_grad_with_place(
-            self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006)
+        if self.dtype == np.float16:
+            self.check_grad_with_place(
+                self.place, ['X'],
+                'Out',
+                in_place=True,
+                max_relative_error=0.02)
+        else:
+            self.check_grad_with_place(
+                self.place, ['X'],
+                'Out',
+                in_place=True,
+                max_relative_error=0.006)
+
+    def init_dtype(self):
+        self.dtype = np.float32
 
     def init_test_case(self):
         self.interp_method = 'nearest'
@@ -108,6 +122,11 @@ def init_test_case(self):
         self.align_corners = False
 
 
+class TestNearestNeighborInterpFP16(TestNearestInterpOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 class TestNearestNeighborInterpCase1(TestNearestInterpOp):
     def init_test_case(self):
         self.interp_method = 'nearest'
diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
index 2c41f09ff5148..8e28b3fe413b0 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
@@ -54,9 +54,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-
         self.check_grad_with_place(
             self.place, ['X'], 'Out', max_relative_error=0.006)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
index 3b75cba60b103..a7ca4edc524be 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py
@@ -51,8 +51,6 @@ def test_check_output(self):
             self.check_output_with_place(paddle.NPUPlace(0))
 
     def test_check_grad(self):
-        if self.dtype == "float16":
-            return
         self.check_grad_with_place(
             paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
index 7d6c3b9bdb444..d1d2e8b3467be 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py
@@ -50,9 +50,10 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            return
-
-        self.check_grad_with_place(self.place, ['X'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['X'], 'Out', max_relative_error=0.6)
+        else:
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
     def set_npu(self):
         self.__class__.use_npu = True
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
index 2b8550a88de59..4822abc3b25eb 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
@@ -67,9 +67,6 @@ def init_kernel_type(self):
             self.use_cudnn = False
             self.dtype = np.float16
 
-        def test_check_grad(self):
-            return
-
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op")
     TestFp16Case.__name__ = cls_name
     globals()[cls_name] = TestFp16Case
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
index e8f5de005d421..899d4ef43bd86 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py
@@ -40,8 +40,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['X'], 'Out', max_relative_error=0.01)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
index 601a351c015f3..b1cb5e02a731f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py
@@ -56,8 +56,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
     def init_dtype(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
index a2547808e6f16..c909b14b5141f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
@@ -34,11 +34,12 @@ def setUp(self):
 
         self.init_dtype()
         np.random.seed(SEED)
-        x = np.random.rand(3, 2).astype(self.dtype)
-        out = x
 
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {}
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+        self.inputs = {'X': x}
         self.outputs = {'Out': out}
 
     def set_npu(self):
@@ -50,32 +51,18 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            self.check_grad_with_place(
+                self.place, ['X'], 'Out', max_relative_error=0.006)
+        else:
+            self.check_grad_with_place(self.place, ['X'], 'Out')
 
-class TestReluFp16(OpTest):
-    def setUp(self):
-        self.set_npu()
-        self.op_type = "relu"
-        self.place = paddle.NPUPlace(0)
-
-        self.init_dtype()
-        np.random.seed(SEED)
-        x = np.random.rand(3, 2).astype(self.dtype)
-        out = x
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {}
-        self.outputs = {'Out': out}
-
-    def set_npu(self):
-        self.__class__.use_npu = True
-        self.__class__.no_need_check_grad = True
 
+class TestReluFp16(TestRelu):
     def init_dtype(self):
         self.dtype = np.float16
 
-    def test_check_output(self):
-        self.check_output_with_place(self.place, atol=1e-5)
-
 
 class TestReluNeg(OpTest):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
index 4516b25b59d9c..489f8bfb116a1 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py
@@ -44,8 +44,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(
             self.place, ['X'], 'Out', max_relative_error=0.01)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
index 611691109e187..a5b203b6eea2a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py
@@ -58,12 +58,17 @@ def set_npu(self):
         self.place = paddle.NPUPlace(0)
 
     def test_check_output(self):
-        self.check_output_with_place(self.place)
+        if self.dtype == np.float16:
+            self.check_output_with_place(self.place)
+        else:
+            self.check_output_with_place(self.place)
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(self.place, ['Input'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['Input'], 'Out', max_relative_error=0.02)
+        else:
+            self.check_grad_with_place(self.place, ['Input'], 'Out')
 
 
 class TestSliceOp2(TestSliceOp):
@@ -347,8 +352,10 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         if self.dtype == np.float16:
-            return
-        self.check_grad_with_place(self.place, ['Input'], 'Out')
+            self.check_grad_with_place(
+                self.place, ['Input'], 'Out', max_relative_error=0.5)
+        else:
+            self.check_grad_with_place(self.place, ['Input'], 'Out')
 
 
 class TestSliceOpDecsDimFp16(TestSliceOpDecsDim):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
index 8d78ee6a97efd..f0ca778834576 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py
@@ -87,8 +87,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32
         self.check_grad_with_place(
             self.place, ['Logits'],
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
index acb99746d231d..24b34fa625c63 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py
@@ -50,12 +50,11 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            self.check_grad(['X'], 'Out', max_relative_error=0.009)
+        else:
+            self.check_grad(['X'], 'Out', max_relative_error=0.009)
 
 
 class TestSqrtFp16(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
index caf55b4850f0b..170f6b6ca4f93 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py
@@ -51,8 +51,6 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
     def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
         self.check_grad_with_place(self.place, ['X'], 'Out')
 
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
index 55be94da2b7e0..375eef12291ec 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py
@@ -50,12 +50,11 @@ def init_dtype(self):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            self.check_grad(['X'], 'Out', max_relative_error=0.009)
+        else:
+            self.check_grad(['X'], 'Out', max_relative_error=0.009)
 
 
 class TestTanhFp16(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 457f20ac5b06b..2d678db4dfcb4 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -698,7 +698,10 @@ def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place):
                 + str(np_dyg) + "\n" + "But Got" + str(np_api) + " in class " +
                 self.__class__.__name__)
 
-    def _calc_python_api_output(self, place):
+    def _calc_python_api_output(self, place, egr_inps=None, egr_oups=None):
+        """ set egr_inps and egr_oups = None if you want to create it by yourself.
+        """
+
         def prepare_python_api_arguments(api, op_proto_ins, op_proto_attrs,
                                          kernel_sig):
             """ map from `op proto inputs and attrs` to `api input list and api attrs dict`
@@ -715,10 +718,11 @@ def get_default(idx, all_params_number, defaults):
                 assert related_idx >= 0, "%d-th arguments don't have default value" % idx
                 return defaults[related_idx]
 
-            def remove_name(x):
-                if isinstance(x, list): return [i for i in x if i != 'name']
+            def filter_by_name(x):
+                names = set(['name', 'out', 'output'])
+                if isinstance(x, list): return [i for i in x if i not in names]
                 if isinstance(x, dict):
-                    return {k: v for k, v in x.items() if k != 'name'}
+                    return {k: v for k, v in x.items() if k not in names}
                 assert False, "Only support list or dict."
 
             def to_defaults_list(params, defaults):
@@ -728,7 +732,7 @@ def to_defaults_list(params, defaults):
             # Because we don't know the python api name of each arguments.
             # using parse_arg_and_kwargs, we can get the all api information we need.
             api_params, api_defaults = [
-                remove_name(item) for item in parse_arg_and_kwargs(api)
+                filter_by_name(item) for item in parse_arg_and_kwargs(api)
             ]
             api_defaults = to_defaults_list(api_params, api_defaults)
             inputs_sig, attrs_sig, outputs_sig = kernel_sig
@@ -752,10 +756,15 @@ def to_defaults_list(params, defaults):
         def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
             if not isinstance(ret_tuple, (tuple, list)):
                 ret_tuple = [ret_tuple]
-            assert len(output_sig) == len(
-                ret_tuple), "expect %d outputs, but get %d outputs" % (
-                    len(output_sig), len(ret_tuple))
-            return {a: b for a, b in zip(output_sig, ret_tuple)}
+            if len(output_sig) == len(ret_tuple):
+                # [assumption]: we assume {"Out": [Tensor]}
+                return {a: [b] for a, b in zip(output_sig, ret_tuple)}
+            else:
+                # [assumption]: return multi-Tensor in a single output. such as paddle.split()
+                assert len(
+                    output_sig
+                ) == 1, "Don't support multi-output with multi-tensor output."
+                return {output_sig[0]: ret_tuple}
 
         def assumption_assert_and_transform(args, inp_num):
             """
@@ -774,6 +783,18 @@ def assumption_assert_and_transform(args, inp_num):
             ] + args[inp_num:]
             return args
 
+        def _get_kernel_signature(eager_tensor_inputs, eager_tensor_outputs,
+                                  attrs_outputs):
+            try:
+                kernel_sig = _dygraph_tracer()._get_kernel_signature(
+                    self.op_type, eager_tensor_inputs, eager_tensor_outputs,
+                    attrs_outputs)
+            except RuntimeError as re:
+                """ we think the kernel_sig is missing.
+                """
+                kernel_sig = None
+            return kernel_sig
+
         def cal_python_api(python_api, args, kernel_sig):
             inputs_sig, attrs_sig, outputs_sig = kernel_sig
             args = assumption_assert_and_transform(args, len(inputs_sig))
@@ -784,10 +805,10 @@ def cal_python_api(python_api, args, kernel_sig):
             block = fluid.default_main_program().global_block()
             op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
             # prepare input variable
-            inputs = self.append_input_output_for_dygraph(op_proto, self.inputs,
-                                                          True, False, block)
+            eager_tensor_inputs = egr_inps if egr_inps else self.append_input_output_for_dygraph(
+                op_proto, self.inputs, True, False, block)
             # prepare output variable
-            outputs = self.append_input_output_for_dygraph(
+            eager_tensor_outputs = egr_oups if egr_oups else self.append_input_output_for_dygraph(
                 op_proto, self.outputs, False, False, block)
 
             # prepare attrbutes
@@ -797,14 +818,15 @@ def cal_python_api(python_api, args, kernel_sig):
                     if self.attrs[attrs_name] is not None:
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
 
-            kernel_sig = _dygraph_tracer()._get_kernel_signature(
-                self.op_type, inputs, outputs, attrs_outputs)
-
+            kernel_sig = _get_kernel_signature(
+                eager_tensor_inputs, eager_tensor_outputs, attrs_outputs)
+            if not kernel_sig:
+                return None
             assert hasattr(
                 self, "python_api"
-            ), "Please set the `self.python_api` if you want to compare python api output."
-            args = prepare_python_api_arguments(self.python_api, inputs,
-                                                attrs_outputs, kernel_sig)
+            ), "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_eager = True" % self.op_type
+            args = prepare_python_api_arguments(
+                self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig)
             """ we directly return the cal_python_api value because the value is already tensor. 
             """
             return cal_python_api(self.python_api, args, kernel_sig)
@@ -1283,14 +1305,13 @@ def check_output_with_place(self,
                 place, no_check_set=no_check_set)
 
         if check_eager:
-            with _test_eager_guard():
-                eager_dygraph_outs = self._calc_dygraph_output(
-                    place, no_check_set=no_check_set)
             # we only check end2end api when check_eager=True
-            if hasattr(self, "python_api"):
-                api_outs = self._calc_python_api_output(place)
-                self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs,
-                                                     place)
+            with _test_eager_guard():
+                eager_dygraph_outs = self._calc_python_api_output(place)
+                if eager_dygraph_outs is None:
+                    # missing KernelSignature, fall back to eager middle output.
+                    eager_dygraph_outs = self._calc_dygraph_output(
+                        place, no_check_set=no_check_set)
 
         outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
 
@@ -1824,7 +1845,7 @@ def check_grad_with_place(self,
         if check_dygraph:
             dygraph_grad = self._get_dygraph_grad(
                 inputs_to_check, place, output_names, user_defined_grad_outputs,
-                no_grad_set)
+                no_grad_set, False)
             fp32_grads = []
             for grad in dygraph_grad:
                 if grad.dtype == np.uint16:
@@ -1840,7 +1861,7 @@ def check_grad_with_place(self,
             with _test_eager_guard():
                 eager_dygraph_grad = self._get_dygraph_grad(
                     inputs_to_check, place, output_names,
-                    user_defined_grad_outputs, no_grad_set)
+                    user_defined_grad_outputs, no_grad_set, check_eager)
                 fp32_grads = []
                 for grad in eager_dygraph_grad:
                     if grad.dtype == np.uint16:
@@ -1866,7 +1887,8 @@ def _get_dygraph_grad(self,
                           place,
                           output_names,
                           user_defined_grad_outputs=None,
-                          no_grad_set=None):
+                          no_grad_set=None,
+                          check_eager=False):
         with fluid.dygraph.base.guard(place=place):
             block = fluid.default_main_program().global_block()
 
@@ -1887,11 +1909,16 @@ def _get_dygraph_grad(self,
                     if self.attrs[attrs_name] is not None:
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
 
-            block.append_op(
-                type=self.op_type,
-                inputs=inputs,
-                outputs=outputs,
-                attrs=attrs_outputs if hasattr(self, "attrs") else None)
+            if check_eager:
+                outputs = self._calc_python_api_output(place, inputs, outputs)
+
+            # if outputs is None, kernel sig is empty or other error is happens.
+            if not check_eager or outputs is None:
+                block.append_op(
+                    type=self.op_type,
+                    inputs=inputs,
+                    outputs=outputs,
+                    attrs=attrs_outputs if hasattr(self, "attrs") else None)
 
             if self.dtype == np.uint16:
                 cast_inputs = self._find_var_in_dygraph(outputs,
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index 50ea065209422..6c964a828eed7 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -123,17 +123,26 @@ def check_grad_with_place(self,
             return super().check_grad_with_place(
                 place, inputs_to_check, output_names, no_grad_set,
                 numeric_grad_delta, in_place, max_relative_error,
-                user_defined_grads, user_defined_grads, check_dygraph)
+                user_defined_grads, user_defined_grad_outputs, check_dygraph)
 
         a1 = self.get_grad_with_place(
-            place, inputs_to_check, output_names, no_grad_set=no_grad_set)
+            place,
+            inputs_to_check,
+            output_names,
+            no_grad_set=no_grad_set,
+            user_defined_grad_outputs=user_defined_grad_outputs)
         a2 = self.get_grad_with_place(
-            place, inputs_to_check, output_names, no_grad_set=no_grad_set)
+            place,
+            inputs_to_check,
+            output_names,
+            no_grad_set=no_grad_set,
+            user_defined_grad_outputs=user_defined_grad_outputs)
         a3 = self.get_grad_with_place(
             paddle.CPUPlace(),
             inputs_to_check,
             output_names,
-            no_grad_set=no_grad_set)
+            no_grad_set=no_grad_set,
+            user_defined_grad_outputs=user_defined_grad_outputs)
         self._assert_is_close(a1, a2, inputs_to_check, 0.00000001,
                               "Gradient Check On two xpu")
         self._assert_is_close(a1, a3, inputs_to_check, max_relative_error,
@@ -147,7 +156,7 @@ def get_grad_with_place(self,
                             numeric_grad_delta=0.005,
                             in_place=False,
                             max_relative_error=0.005,
-                            user_defined_grads=None,
+                            user_defined_grad_outputs=None,
                             check_dygraph=True):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
@@ -197,6 +206,10 @@ def get_grad_with_place(self,
         if not type(output_names) is list:
             output_names = [output_names]
 
-        analytic_grads = self._get_gradient(inputs_to_check, place,
-                                            output_names, no_grad_set)
+        analytic_grads = self._get_gradient(
+            inputs_to_check,
+            place,
+            output_names,
+            no_grad_set,
+            user_defined_grad_outputs=user_defined_grad_outputs)
         return analytic_grads
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py
index 8ff68a1ce0d69..91c340c35d478 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py
@@ -19,6 +19,7 @@
 import os
 import numpy as np
 import random
+import socket
 
 import paddle
 import paddle.nn as nn
@@ -31,13 +32,26 @@
 from paddle.fluid.initializer import NumpyArrayInitializer
 
 
+def net_is_used(port, ip='127.0.0.1'):
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        s.connect((ip, port))
+        s.shutdown(2)
+        return True
+    except Exception as e:
+        return False
+
+
 def init_process_group(strategy=None):
     nranks = ParallelEnv().nranks
     rank = ParallelEnv().local_rank
     is_master = True if rank == 0 else False
-    store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks)
-    group = core.ProcessGroupNCCL(store, rank, nranks)
-    return group
+    for port in range(20000, 21000):
+        if not net_is_used(port):
+            store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master,
+                                               nranks)
+            group = core.ProcessGroupNCCL(store, rank, nranks)
+            return group
 
 
 class LinearModel(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py
new file mode 100644
index 0000000000000..214f41c78a3a5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import os
+
+import paddle
+import numpy as np
+import paddle.distributed as dist
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+import paddle.fluid.core as core
+
+paddle.seed(1024)
+np.random.seed(2021)
+
+batch = 5
+in_dim = 10
+out_dim = 20
+
+
+def init_process_group(strategy=None):
+    nranks = ParallelEnv().nranks
+    rank = ParallelEnv().local_rank
+    is_master = True if rank == 0 else False
+    store = paddle.fluid.core.TCPStore("127.0.0.1", 6174, is_master, nranks)
+    group = core.ProcessGroupNCCL(store, rank, nranks)
+    return group
+
+
+class SimpleNet(fluid.Layer):
+    def __init__(self, train_id):
+        super(SimpleNet, self).__init__()
+        self.w1 = self.create_parameter(
+            shape=[in_dim, out_dim], dtype="float32")
+        self.w2 = self.create_parameter(
+            shape=[in_dim, out_dim], dtype="float32")
+        self.share_net = Linear(out_dim, 10)
+
+        self.unused_param = self.create_parameter(
+            shape=[out_dim, in_dim], dtype="float64")
+
+        # just for test sync_params_buffers
+        # self.register_buffer("queue", paddle.randn([10, 5]))
+        # self.queue = paddle.nn.functional.normalize(self.queue, axis=0)
+        # self.register_buffer("queue_ptr", paddle.zeros([1], 'int64'))
+
+        self.trainer_id = train_id
+
+    def forward(self, x):
+        is_use = (paddle.equal_all(
+            x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and
+                  self.trainer_id == 1)
+
+        if is_use:
+            tmp = paddle.matmul(x, self.w1)
+        else:
+            tmp = paddle.matmul(x, self.w2)
+
+        return self.share_net(tmp)
+
+
+class TestDistTraning(unittest.TestCase):
+    def test_multiple_gpus(self):
+        dist.init_parallel_env()
+        self.trainer_id = dist.get_rank()
+
+        process_group = init_process_group()
+        self.pg = process_group
+        with _test_eager_guard():
+
+            model_a = SimpleNet(self.trainer_id)
+            model_b = SimpleNet(self.trainer_id)
+
+            state_dict = model_a.state_dict()
+            model_b.set_state_dict(state_dict)
+
+            model_a = paddle.DataParallel(
+                model_a,
+                find_unused_parameters=True,
+                process_group=process_group)
+            model_b = paddle.DataParallel(
+                model_b,
+                find_unused_parameters=True,
+                process_group=process_group)
+
+            ones_input = paddle.ones(shape=(batch, in_dim))
+            ones_input.stop_gradient = True
+
+            w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
+            w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
+
+            for step_id in range(5):
+                print("==============", step_id)
+                random_input = paddle.rand(shape=(batch, in_dim))
+                random_input.stop_gradient = True
+
+                if step_id % 2 == 0:
+                    out_a = model_a(random_input)
+                    out_b = model_b(random_input)
+                else:
+                    out_a = model_a(ones_input)
+                    out_b = model_b(ones_input)
+
+                out_a.sum().backward()
+                out_b.sum().backward()
+
+                self.check_gradient(model_a.parameters())
+                self.check_gradient(model_b.parameters())
+
+                # test acc gradient
+                w1_grad_sum = self.check_acc(model_a._layers.w1.grad,
+                                             w1_grad_sum,
+                                             model_b._layers.w1.grad)
+                w2_grad_sum = self.check_acc(model_a._layers.w2.grad,
+                                             w2_grad_sum,
+                                             model_b._layers.w2.grad)
+
+                model_a.clear_gradients()
+
+    def check_acc(self, grad, grad_sum, acc_grad):
+        if grad is not None:
+            grad_sum = grad_sum + grad.numpy()
+            acc_grad = acc_grad.numpy() if acc_grad is not None else None
+            np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6)
+        return grad_sum
+
+    def print_trainer_0(self, *args):
+        if self.trainer_id == 0:
+            print(*args)
+
+    def broadcast_param(self, param, root):
+        self.pg.broadcast(param, root)
+        return param
+
+    def check_gradient(self, params):
+        other_param = []
+        for param in params:
+            if param.trainable and (param.grad is not None):
+                grad = param.grad
+                other_grad = self.broadcast_param(grad, root=1)
+                if self.trainer_id == 0:
+                    np.testing.assert_allclose(other_grad.numpy(), grad.numpy())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py
index c62c4615f7470..b1f3a71ab3e94 100644
--- a/python/paddle/fluid/tests/unittests/process_group_gloo.py
+++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py
@@ -47,9 +47,7 @@ def test_create_process_group_gloo(self):
             is_master = True if rank == 0 else False
             store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master,
                                                nranks, datetime.timedelta(0))
-            gloo_store = paddle.fluid.core.GlooStore(store)
-            opt = paddle.fluid.core.GlooOptions()
-            pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks)
+            pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks)
 
             # test allreduce sum
             # rank 0
diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
index 737c085dde6ac..34b6f6dc8e545 100644
--- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
+++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_concat.py
@@ -20,6 +20,7 @@
 sys.path.append("../")
 from op_test import OpTest
 
+import paddle
 from paddle import fluid
 
 
@@ -115,4 +116,5 @@ def test_dtype():
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_Tensor_type.py b/python/paddle/fluid/tests/unittests/test_Tensor_type.py
index 59395b94279ea..f1427d29782b9 100644
--- a/python/paddle/fluid/tests/unittests/test_Tensor_type.py
+++ b/python/paddle/fluid/tests/unittests/test_Tensor_type.py
@@ -39,6 +39,7 @@ def test_type_Tensor(self):
 
         tensorx = paddle.tensor.logic.Tensor(inx)
         typex_str = str(type(tensorx))
+
         expectx = "<class 'paddle.Tensor'>"
         self.assertEqual((typex_str == expectx), True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 5c40b898d2325..add49d11e53a1 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -1039,7 +1039,7 @@ def setUp(self):
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out', check_eager=True)
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestCeil(TestActivation):
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index ecac22553cbcd..d05c9a3c313bb 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -1202,4 +1202,5 @@ def test_main(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py
index 6238d7dd4a1f4..dcf07f4953200 100644
--- a/python/paddle/fluid/tests/unittests/test_addmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py
@@ -27,6 +27,7 @@ class TestAddMMOp(OpTest):
     # test basic
     def setUp(self):
         self.op_type = "addmm"
+        self.python_api = paddle.addmm
         self.dtype = np.float64
         self.init_dtype_type()
         self.inputs = {
@@ -43,19 +44,19 @@ def init_dtype_type(self):
         pass
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input', 'X', 'Y'], 'Out')
+        self.check_grad(['Input', 'X', 'Y'], 'Out', check_eager=False)
 
     def test_check_grad_x(self):
-        self.check_grad(['X'], 'Out', no_grad_set=None)
+        self.check_grad(['X'], 'Out', no_grad_set=None, check_eager=False)
 
     def test_check_grad_y(self):
-        self.check_grad(['Y'], 'Out', no_grad_set=None)
+        self.check_grad(['Y'], 'Out', no_grad_set=None, check_eager=False)
 
     def test_check_grad_input(self):
-        self.check_grad(['Input'], 'Out', no_grad_set=None)
+        self.check_grad(['Input'], 'Out', no_grad_set=None, check_eager=False)
 
 
 class TestAddMMOpError(unittest.TestCase):
@@ -167,6 +168,7 @@ class TestAddMMOp2(TestAddMMOp):
     # test alpha and beta
     def setUp(self):
         self.op_type = "addmm"
+        self.python_api = paddle.addmm
         self.dtype = np.float64
         self.init_dtype_type()
         self.inputs = {
@@ -252,4 +254,5 @@ def test_error1():
 '''
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
index adf238c43d21a..2abdbdc5940f7 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
@@ -58,7 +58,7 @@ class TestAssignValueOp4(TestAssignValueOp):
     def init_data(self):
         self.value = numpy.random.choice(
             a=[False, True], size=(2, 5)).astype(numpy.bool)
-        self.attrs["bool_values"] = [bool(v) for v in self.value.flat]
+        self.attrs["bool_values"] = [int(v) for v in self.value.flat]
 
 
 class TestAssignApi(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_atan2_op.py b/python/paddle/fluid/tests/unittests/test_atan2_op.py
index b29ab822f25de..ca0e2d2ba6dda 100644
--- a/python/paddle/fluid/tests/unittests/test_atan2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_atan2_op.py
@@ -36,6 +36,7 @@ def atan2_grad(x1, x2, dout):
 class TestAtan2(OpTest):
     def setUp(self):
         self.op_type = "atan2"
+        self.python_api = paddle.atan2
         self.init_dtype()
 
         x1 = np.random.uniform(-1, -0.1, [15, 17]).astype(self.dtype)
@@ -46,10 +47,10 @@ def setUp(self):
         self.outputs = {'Out': out}
 
     def test_check_grad(self):
-        self.check_grad(['X1', 'X2'], 'Out')
+        self.check_grad(['X1', 'X2'], 'Out', check_eager=True)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def init_dtype(self):
         self.dtype = np.float64
@@ -66,7 +67,8 @@ def test_check_grad(self):
                 'Out',
                 user_defined_grads=atan2_grad(self.inputs['X1'],
                                               self.inputs['X2'],
-                                              1 / self.inputs['X1'].size))
+                                              1 / self.inputs['X1'].size),
+                check_eager=True)
 
 
 class TestAtan2_float16(TestAtan2_float):
@@ -129,4 +131,5 @@ def run(place):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index b440e745b1082..789cfa82658f4 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -451,4 +451,5 @@ def test_to_api_numpy_dtype(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py
index ea1a22780f093..1051fa9c1aefa 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -244,4 +244,5 @@ def init_test_cast(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
index aba95a68ab790..9b99e553d182b 100644
--- a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
+++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
@@ -17,5 +17,4 @@
 set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
-launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
index c31594b75e985..bb45a52566211 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py
@@ -29,6 +29,7 @@
 paddle.enable_static()
 
 
+#cholesky_solve implement 1
 def cholesky_solution(X, B, upper=True):
     if upper:
         A = np.triu(X)
@@ -43,6 +44,7 @@ def cholesky_solution(X, B, upper=True):
             L, B, lower=True))
 
 
+#cholesky_solve implement 2
 def scipy_cholesky_solution(X, B, upper=True):
     if upper:
         umat = np.triu(X)
@@ -54,27 +56,29 @@ def scipy_cholesky_solution(X, B, upper=True):
     return scipy.linalg.cho_solve(K, B)
 
 
-def boardcast_shape(matA, matB):
+#broadcast function used by cholesky_solve
+def broadcast_shape(matA, matB):
     shapeA = matA.shape
     shapeB = matB.shape
-    Boardshape = []
+    Broadshape = []
     for idx in range(len(shapeA) - 2):
         if shapeA[idx] == shapeB[idx]:
-            Boardshape.append(shapeA[idx])
+            Broadshape.append(shapeA[idx])
             continue
         elif shapeA[idx] == 1 or shapeB[idx] == 1:
-            Boardshape.append(max(shapeA[idx], shapeB[idx]))
+            Broadshape.append(max(shapeA[idx], shapeB[idx]))
         else:
             raise Exception(
-                'shapeA and shapeB should be boardcasted, but got {} and {}'.
+                'shapeA and shapeB should be broadcasted, but got {} and {}'.
                 format(shapeA, shapeB))
-    bsA = Boardshape + list(shapeA[-2:])
-    bsB = Boardshape + list(shapeB[-2:])
+    bsA = Broadshape + list(shapeA[-2:])
+    bsB = Broadshape + list(shapeB[-2:])
     return np.broadcast_to(matA, bsA), np.broadcast_to(matB, bsB)
 
 
+#cholesky_solve implement in batch
 def scipy_cholesky_solution_batch(bumat, bB, upper=True):
-    bumat, bB = boardcast_shape(bumat, bB)
+    bumat, bB = broadcast_shape(bumat, bB)
     ushape = bumat.shape
     bshape = bB.shape
     bumat = bumat.reshape((-1, ushape[-2], ushape[-1]))
@@ -90,18 +94,21 @@ def scipy_cholesky_solution_batch(bumat, bB, upper=True):
     return np.array(bx).reshape(bshape)
 
 
-# 2D + 2D , , upper=False
+# test condition: shape: 2D + 2D , upper=False
+# based on OpTest class
 class TestCholeskySolveOp(OpTest):
     """
     case 1
     """
 
+    #test condition set
     def config(self):
         self.y_shape = [15, 15]
         self.x_shape = [15, 5]
         self.upper = False
-        self.dtype = np.float64
+        self.dtype = np.float64  #Here cholesky_solve Op only supports float64/float32 type, please check others if Op supports more types.
 
+    #get scipy result
     def set_output(self):
         umat = self.inputs['Y']
         self.output = scipy_cholesky_solution_batch(
@@ -124,14 +131,16 @@ def setUp(self):
         self.set_output()
         self.outputs = {'Out': self.output}
 
+    #check Op forward result
     def test_check_output(self):
         self.check_output()
 
+    #check Op grad
     def test_check_grad_normal(self):
         self.check_grad(['Y'], 'Out', max_relative_error=0.01)
 
 
-# 3D(broadcast) + 3D, upper=True
+# test condition:  3D(broadcast) + 3D, upper=True
 class TestCholeskySolveOp3(TestCholeskySolveOp):
     """
     case 3
@@ -144,11 +153,11 @@ def config(self):
         self.dtype = np.float64
 
 
+#API function test
 class TestCholeskySolveAPI(unittest.TestCase):
     def setUp(self):
         np.random.seed(2021)
         self.place = [paddle.CPUPlace()]
-        # self.place = [paddle.CUDAPlace(0)]
         self.dtype = "float64"
         self.upper = True
         if core.is_compiled_with_cuda():
@@ -177,10 +186,12 @@ def check_static_result(self, place):
                               fetch_list=[z])
             self.assertTrue(np.allclose(fetches[0], z_np))
 
+    #test in static mode
     def test_static(self):
         for place in self.place:
             self.check_static_result(place=place)
 
+    #test in dynamic mode
     def test_dygraph(self):
         def run(place):
             paddle.disable_static(place)
@@ -199,7 +210,8 @@ def run(place):
         for idx, place in enumerate(self.place):
             run(place)
 
-    def test_boardcast(self):
+    #test input with broadcast
+    def test_broadcast(self):
         def run(place):
             paddle.disable_static()
             x_np = np.random.random([1, 30, 2]).astype(self.dtype)
@@ -218,6 +230,7 @@ def run(place):
             run(place)
 
 
+#test condition out of bounds
 class TestCholeskySolveOpError(unittest.TestCase):
     def test_errors(self):
         paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
index 58baa0a2fa944..e00f90f4b0d5f 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_process_group.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py
@@ -25,6 +25,9 @@ def test_process_group_nccl(self):
     def test_process_group_gloo(self):
         self.run_mnist_2gpu('process_group_gloo.py')
 
+    def test_init_process_group(self):
+        self.run_mnist_2gpu('init_process_group.py')
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index f92465b739a2a..bd9ec6b663f60 100755
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -30,12 +30,13 @@ def setUp(self):
             a = numpy.random.random(size=(10, 7)).astype(typename)
             b = numpy.random.random(size=(10, 7)).astype(typename)
             c = callback(a, b)
+            self.python_api = eval("paddle." + op_type)
             self.inputs = {'X': a, 'Y': b}
             self.outputs = {'Out': c}
             self.op_type = op_type
 
         def test_output(self):
-            self.check_output()
+            self.check_output(check_eager=False)
 
         def test_errors(self):
             paddle.enable_static()
@@ -338,4 +339,5 @@ def test_place_2(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_create_parameter.py b/python/paddle/fluid/tests/unittests/test_create_parameter.py
index 763fb64816c9c..199558acd4ef6 100644
--- a/python/paddle/fluid/tests/unittests/test_create_parameter.py
+++ b/python/paddle/fluid/tests/unittests/test_create_parameter.py
@@ -18,6 +18,7 @@
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
 from paddle.fluid import ParamAttr, initializer
+import paddle
 
 
 class TestCreateParameterError(unittest.TestCase):
@@ -50,4 +51,5 @@ def test_default_initializer():
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cross_op.py b/python/paddle/fluid/tests/unittests/test_cross_op.py
index 8e53a36f0510d..6cba72213ff97 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_op.py
@@ -26,6 +26,7 @@
 class TestCrossOp(OpTest):
     def setUp(self):
         self.op_type = "cross"
+        self.python_api = paddle.cross
         self.initTestCase()
         self.inputs = {
             'X': np.random.random(self.shape).astype(self.dtype),
@@ -47,10 +48,10 @@ def init_output(self):
         self.outputs = {'Out': np.array(z_list).reshape(self.shape)}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out')
+        self.check_grad(['X', 'Y'], 'Out', check_eager=False)
 
 
 class TestCrossOpCase1(TestCrossOp):
@@ -114,14 +115,14 @@ def test_cross_api(self):
     def test_dygraph_api(self):
         self.input_data()
         # case 1:
-        with fluid.dygraph.guard():
-            x = fluid.dygraph.to_variable(self.data_x)
-            y = fluid.dygraph.to_variable(self.data_y)
-            z = paddle.cross(x, y)
-            np_z = z.numpy()
-        expect_out = np.array([[-1.0, -1.0, -1.0], [2.0, 2.0, 2.0],
-                               [-1.0, -1.0, -1.0]])
-        self.assertTrue(np.allclose(expect_out, np_z))
+        # with fluid.dygraph.guard():
+        #     x = fluid.dygraph.to_variable(self.data_x)
+        #     y = fluid.dygraph.to_variable(self.data_y)
+        #     z = paddle.cross(x, y)
+        #     np_z = z.numpy()
+        # expect_out = np.array([[-1.0, -1.0, -1.0], [2.0, 2.0, 2.0],
+        #                        [-1.0, -1.0, -1.0]])
+        # self.assertTrue(np.allclose(expect_out, np_z))
 
         # case 2:
         with fluid.dygraph.guard():
@@ -135,4 +136,5 @@ def test_dygraph_api(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index f5934debfd7b6..ffc5bc184efc2 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -20,6 +20,7 @@
 from op_test import OpTest
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
+import paddle
 
 
 def CTCAlign(input, lod, blank, merge_repeated, padding=0, input_length=None):
@@ -229,4 +230,5 @@ def test_bad_x():
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py
index 9f727608f816c..4047ccb8782c8 100644
--- a/python/paddle/fluid/tests/unittests/test_diag_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py
@@ -27,6 +27,7 @@
 class TestDiagV2Op(OpTest):
     def setUp(self):
         self.op_type = "diag_v2"
+        self.python_api = paddle.diag
         self.x = np.random.rand(10, 10)
         self.offset = 0
         self.padding_value = 0.0
@@ -42,11 +43,11 @@ def setUp(self):
 
     def test_check_output(self):
         paddle.enable_static()
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
         paddle.enable_static()
-        self.check_grad(['X'], 'Out', check_eager=True)
+        self.check_grad(['X'], 'Out', check_eager=False)
 
     def init_config(self):
         pass
@@ -267,4 +268,5 @@ def test_gpu(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diagonal_op.py b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
index b4854aea52a70..7db5fcb9625a6 100644
--- a/python/paddle/fluid/tests/unittests/test_diagonal_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diagonal_op.py
@@ -30,6 +30,7 @@
 class TestDiagonalOp(OpTest):
     def setUp(self):
         self.op_type = "diagonal"
+        self.python_api = paddle.diagonal
         self.init_config()
         self.outputs = {'Out': self.target}
 
diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py
index 1ae780f488d2d..4a96827bd7c3c 100644
--- a/python/paddle/fluid/tests/unittests/test_diff_op.py
+++ b/python/paddle/fluid/tests/unittests/test_diff_op.py
@@ -211,4 +211,5 @@ def set_args(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_digamma_op.py b/python/paddle/fluid/tests/unittests/test_digamma_op.py
index 3cb31b888f431..4897becf61144 100644
--- a/python/paddle/fluid/tests/unittests/test_digamma_op.py
+++ b/python/paddle/fluid/tests/unittests/test_digamma_op.py
@@ -29,6 +29,7 @@ def setUp(self):
         paddle.enable_static()
 
         self.op_type = 'digamma'
+        self.python_api = paddle.digamma
         self.init_dtype_type()
         shape = (5, 32)
         data = np.random.random(shape).astype(self.dtype) + 1
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
index f95546f15f002..27d82fcc8903b 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py
@@ -190,4 +190,5 @@ def test_check_output(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
index 27aec284de4cd..8166598677a3e 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
@@ -52,7 +52,7 @@ def test_retain_grad_and_run_backward(self):
             out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True)
             self.assertIsNone(data_eager.grad)
             out_eager.backward(grad_eager, False)
-            self.assertTrue(data_eager.grad._is_initialized())
+            self.assertIsNotNone(data_eager.grad)
             self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data))
 
     def test_retain_grad_and_run_backward_raises(self):
@@ -637,6 +637,10 @@ def test_copy_and_copy_to(self):
                 self.assertTrue(tensor3.persistable, True)
                 self.assertTrue(tensor3.stop_gradient, True)
                 self.assertTrue(tensor3.place.is_gpu_place())
+                tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
+                tensor5 = tensor4._copy_to(core.CUDAPlace(0), True)
+                self.assertTrue(
+                    np.array_equal(tensor4.numpy(), tensor5.numpy()))
             else:
                 tensor3 = tensor2._copy_to(core.CPUPlace(), True)
                 self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py
index 13e763bee6305..43b5ce96a3901 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum.py
@@ -26,14 +26,14 @@ def setUp(self):
     def test_diagonalize_errors(self):
         a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float')
         a = paddle.to_tensor(a)
-        with self.assertRaisesRegex(AssertionError, (
-                'Diagonal and trace not implemented yet.')):
+        with self.assertRaisesRegex(AssertionError,
+                                    ('Duplicate labels are not supported.')):
             paddle.einsum('...ii->...i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                'Diagonal and trace not implemented yet.')):
+        with self.assertRaisesRegex(AssertionError,
+                                    ('Duplicate labels are not supported.')):
             paddle.einsum('i...i', a)
-        with self.assertRaisesRegex(AssertionError, (
-                'Diagonal and trace not implemented yet.')):
+        with self.assertRaisesRegex(AssertionError,
+                                    ('Duplicate labels are not supported.')):
             paddle.einsum('i...i->i...', a)
 
     def test_param_errors(self):
@@ -396,6 +396,51 @@ def test_large_nops(self):
         self.check_output('a...b,b...c,c...a', a, a, a)
         self.check_output('...ab,...ba,...ab,...ab', a, a, a, a)
 
+    def test_static_graph(self):
+        paddle.enable_static()
+        fluid = paddle.fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.place = fluid.CUDAPlace(0)
+        else:
+            self.place = fluid.CPUPlace()
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            a = paddle.static.data(
+                name='a', shape=[3, None, None, None], dtype='float')
+            b = paddle.static.data(
+                name='b', shape=[2, None, None, None], dtype='float')
+            c = paddle.static.data(
+                name='c', shape=[None, None, 2, None], dtype='float')
+            d = paddle.static.data(
+                name='d', shape=[None, None, 5], dtype='float')
+            e = paddle.static.data(
+                name='e', shape=[None, 2, None], dtype='float')
+
+            outs = []
+            outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b))
+            outs.append(paddle.einsum('...ik, ...j', c, d))
+            outs.append(paddle.einsum('...kj, ...ik', d, e))
+            outs.append(paddle.einsum('ijk..., ikj', c, e))
+            outs.append(paddle.einsum('ijk..., ikj->...ij', c, e))
+        exe = fluid.Executor(self.place)
+        exe.run(startup)
+        a = np.arange(72).reshape(3, 2, 3, 4).astype('float')
+        b = np.arange(48).reshape(2, 2, 3, 4).astype('float')
+        c = np.arange(48).reshape(2, 3, 2, 4).astype('float')
+        d = np.arange(30).reshape(2, 3, 5).astype('float')
+        e = np.arange(12).reshape(2, 2, 3).astype('float')
+        feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e}
+        actual = exe.run(main, feed=feeds, fetch_list=[outs])
+        expect = []
+        expect.append(np.einsum("ibnd,jbnd->bnij", a, b))
+        expect.append(np.einsum('...ik, ...j', c, d))
+        expect.append(np.einsum('...kj, ...ik', d, e))
+        expect.append(np.einsum('ijk..., ikj', c, e))
+        expect.append(np.einsum('ijk..., ikj->...ij', c, e))
+        for a, e in zip(actual, expect):
+            self.check_output_equal(a, e)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index d1d391a3949ea..4ddfe9d1559de 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -17,7 +17,7 @@
 import numpy as np
 import paddle
 import paddle.fluid.core as core
-from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 
@@ -40,16 +40,25 @@ def setUp(self):
         self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': self.out}
 
+    def check_eager(self):
+        return False
+        #return (self.use_mkldnn == False and self.axis == -1)
+
     def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(check_dygraph=(self.use_mkldnn == False))
+        self.check_output(
+            check_dygraph=(self.use_mkldnn == False),
+            check_eager=self.check_eager())
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.dtype == np.float16:
             return
         self.check_grad(
-            ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
+            ['X', 'Y'],
+            'Out',
+            check_dygraph=(self.use_mkldnn == False),
+            check_eager=self.check_eager())
 
     def test_check_grad_ingore_x(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
@@ -59,7 +68,8 @@ def test_check_grad_ingore_x(self):
             ['Y'],
             'Out',
             no_grad_set=set("X"),
-            check_dygraph=(self.use_mkldnn == False))
+            check_dygraph=(self.use_mkldnn == False),
+            check_eager=self.check_eager())
 
     def test_check_grad_ingore_y(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
@@ -69,7 +79,8 @@ def test_check_grad_ingore_y(self):
             ['X'],
             'Out',
             no_grad_set=set('Y'),
-            check_dygraph=(self.use_mkldnn == False))
+            check_dygraph=(self.use_mkldnn == False),
+            check_eager=self.check_eager())
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -123,19 +134,21 @@ def setUp(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_eager=False)
 
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X', 'Y'], 'Out')
+        self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_eager=False)
 
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X"))
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', no_grad_set=set("X"), check_eager=False)
 
     def test_check_grad_ingore_y(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y'))
+        self.check_grad_with_place(
+            place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False)
 
 
 @skip_check_grad_ci(
@@ -586,7 +599,7 @@ def init_grad_input_output(self):
         self.grad_y = self.grad_out
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index a43e56b0815a6..a86758a9cb92b 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -24,6 +24,7 @@
 class ElementwiseDivOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_div"
+        self.python_api = paddle.divide
         self.dtype = np.float64
         self.init_dtype()
         """ Warning
@@ -37,8 +38,11 @@ def setUp(self):
         }
         self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
 
+    def check_eager(self):
+        return (self.use_mkldnn == False and self.axis == -1)
+
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 00967cb503fe5..b35b2840ed30a 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -23,7 +23,7 @@
 from paddle.fluid import Program, compiler, program_guard
 from paddle.fluid.op import Operator
 
-from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
 
 
 class ElementwiseMulOp(OpTest):
diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py
index ccbc0a1676302..7a3ae203be62d 100644
--- a/python/paddle/fluid/tests/unittests/test_exponential_op.py
+++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py
@@ -209,4 +209,5 @@ def test_fixed_random_number(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index d2bffbe074f2a..0ae005430e03b 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -213,9 +213,9 @@ def test_sharding_amp_asp_optimizer(self):
             set(parameters),
             set([
                 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0',
-                'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask',
-                'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask',
-                'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0',
+                'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask',
+                'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask',
+                'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0',
                 'fc_2.b_0_velocity_0'
             ]))
         self.assertEqual(ops, [
diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py
index 5cdf096be6708..7231823c37532 100644
--- a/python/paddle/fluid/tests/unittests/test_fmin_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py
@@ -189,3 +189,8 @@ def test_check_grad_ingore_y(self):
         """test_check_grad_ingore_y"""
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
index 1dbc1c056128c..ac2d980f7fd38 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
@@ -22,10 +22,11 @@
 
 
 class TestGatherNdOpWithEmptyIndex(OpTest):
-    #Index has empty element, which means copy entire tensor
+    # Index has empty element, which means copy entire tensor
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         xnp = np.random.random((5, 20)).astype("float64")
         self.inputs = {'X': xnp, 'Index': np.array([[], []]).astype("int32")}
         self.outputs = {
@@ -33,24 +34,25 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpWithIndex1(OpTest):
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         xnp = np.random.random((5, 20)).astype("float64")
         self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")}
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpWithLowIndex(OpTest):
@@ -58,6 +60,7 @@ class TestGatherNdOpWithLowIndex(OpTest):
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         xnp = np.random.uniform(0, 100, (10, 10)).astype("float64")
         index = np.array([[1], [2]]).astype("int64")
 
@@ -66,10 +69,10 @@ def setUp(self):
         self.outputs = {'Out': xnp[tuple(index.T)]}  #[[14, 25, 1], [76, 22, 3]]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpIndex1(OpTest):
@@ -77,18 +80,19 @@ class TestGatherNdOpIndex1(OpTest):
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         xnp = np.random.uniform(0, 100, (10, 10)).astype("float64")
-        index = np.array([1, 2]).astype("int64")
+        index = np.array([1, 2]).astype("int32")
 
         self.inputs = {'X': xnp, 'Index': index}
 
         self.outputs = {'Out': xnp[tuple(index.T)]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpWithSameIndexAsX(OpTest):
@@ -96,6 +100,7 @@ class TestGatherNdOpWithSameIndexAsX(OpTest):
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         xnp = np.random.uniform(0, 100, (10, 10)).astype("float64")
         index = np.array([[1, 1], [2, 1]]).astype("int64")
 
@@ -103,10 +108,10 @@ def setUp(self):
         self.outputs = {'Out': xnp[tuple(index.T)]}  #[25, 22]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpWithHighRankSame(OpTest):
@@ -114,6 +119,7 @@ class TestGatherNdOpWithHighRankSame(OpTest):
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         shape = (5, 2, 3, 1, 10)
         xnp = np.random.rand(*shape).astype("float64")
         index = np.vstack([np.random.randint(0, s, size=2) for s in shape]).T
@@ -122,10 +128,10 @@ def setUp(self):
         self.outputs = {'Out': xnp[tuple(index.T)]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 class TestGatherNdOpWithHighRankDiff(OpTest):
@@ -133,6 +139,7 @@ class TestGatherNdOpWithHighRankDiff(OpTest):
 
     def setUp(self):
         self.op_type = "gather_nd"
+        self.python_api = paddle.gather_nd
         shape = (2, 3, 4, 1, 10)
         xnp = np.random.rand(*shape).astype("float64")
         index = np.vstack([np.random.randint(0, s, size=200) for s in shape]).T
@@ -142,10 +149,10 @@ def setUp(self):
         self.outputs = {'Out': xnp[tuple(index.T)].reshape([20, 5, 2])}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
 
 #Test Python API
@@ -245,4 +252,5 @@ def test_imperative(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
index 74e2cd9f74144..6fe68c5d34ffa 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py
@@ -25,6 +25,7 @@
 class TestGatherTreeOp(OpTest):
     def setUp(self):
         self.op_type = "gather_tree"
+        self.python_api = paddle.nn.functional.gather_tree
         max_length, batch_size, beam_size = 5, 2, 2
         ids = np.random.randint(
             0, high=10, size=(max_length, batch_size, beam_size))
@@ -34,7 +35,7 @@ def setUp(self):
         self.outputs = {'Out': self.backtrace(ids, parents)}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     @staticmethod
     def backtrace(ids, parents):
@@ -126,4 +127,5 @@ def test_type_parents():
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
index 68b354775d13e..30f943e3248e9 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
@@ -304,6 +304,35 @@ def test_int32_input(self):
                 "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
+    def test_set_outsize_gpu(self):
+        if paddle.fluid.core.is_compiled_with_cuda():
+            x = paddle.to_tensor(
+                np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32")
+            src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32")
+            dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32")
+            res = paddle.incubate.graph_send_recv(x, src_index, dst_index,
+                                                  "sum")
+            out_size = paddle.max(dst_index) + 1
+            res_set_outsize = paddle.incubate.graph_send_recv(
+                x, src_index, dst_index, "sum", out_size)
+
+            np_res = np.array(
+                [[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32")
+            np_res_set_outsize = np.array(
+                [[0, 2, 3], [1, 6, 8]], dtype="float32")
+
+            self.assertTrue(
+                np.allclose(
+                    np_res, res, atol=1e-6),
+                "two value is\
+                {}\n{}, check diff!".format(np_res, res))
+            self.assertTrue(
+                np.allclose(
+                    np_res_set_outsize, res_set_outsize, atol=1e-6),
+                "two value is\
+                {}\n{}, check diff!"
+                .format(np_res_set_outsize, res_set_outsize))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 44d73612b1cb5..39b79dd4ba26b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -182,7 +182,7 @@ def test_auto_prune2(self):
         self.func_auto_prune2()
 
     # TODO(jiabin): Support this when we support better split tensor
-    def test_auto_prune3(self):
+    def func_auto_prune3(self):
         with fluid.dygraph.guard():
             case3 = AutoPruneLayer3(input_size=784)
             value1 = np.arange(784).reshape(1, 784).astype("float32")
@@ -194,7 +194,12 @@ def test_auto_prune3(self):
             self.assertTrue(case3.linear.weight._grad_ivar() is not None)
             self.assertTrue((part2.gradient() == 0).all())
 
-    def test_auto_prune4(self):
+    def test_auto_prune3(self):
+        with _test_eager_guard():
+            self.func_auto_prune3()
+        self.func_auto_prune3()
+
+    def func_auto_prune4(self):
         with fluid.dygraph.guard():
             case4 = AutoPruneLayer3(input_size=784)
             value1 = np.arange(784).reshape(1, 784).astype("float32")
@@ -206,7 +211,12 @@ def test_auto_prune4(self):
             self.assertTrue(case4.linear.weight._grad_ivar() is not None)
             self.assertTrue((part2.gradient() == 1).all())
 
-    def test_auto_prune5(self):
+    def test_auto_prune4(self):
+        with _test_eager_guard():
+            self.func_auto_prune4()
+        self.func_auto_prune4()
+
+    def func_auto_prune5(self):
         with fluid.dygraph.guard():
             case4 = AutoPruneLayer3(input_size=784)
             value1 = np.arange(784).reshape(1, 784).astype("float32")
@@ -218,6 +228,11 @@ def test_auto_prune5(self):
             self.assertTrue(case4.linear.weight._grad_ivar() is not None)
             self.assertTrue((part2.gradient() == 0).all())
 
+    def test_auto_prune5(self):
+        with _test_eager_guard():
+            self.func_auto_prune5()
+        self.func_auto_prune5()
+
     def func_auto_prune6(self):
         with fluid.dygraph.guard():
             value0 = np.arange(26).reshape(2, 13).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
index cd4ba5b054264..4d5f657d51e0b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,9 @@
 import unittest
 from unittest import TestCase
 import numpy as np
+import paddle.compat as cpt
+from paddle.fluid.framework import _test_eager_guard
+import paddle.fluid.core as core
 
 
 def _dygraph_guard_(func):
@@ -40,6 +43,128 @@ def random_var(size, low=-1, high=1, dtype='float32'):
     return fluid.dygraph.to_variable(x_np)
 
 
+class TestEagerGrad(TestCase):
+    def func_simple_example_eager_grad(self):
+        np.random.seed(2021)
+        paddle.set_device('cpu')
+        np_x = np.random.random((3, 3))
+        np_y = np.random.random((3, 1))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
+        out = paddle.matmul(x, y)
+        dx = fluid.dygraph.grad(out, x)
+
+        dout = np.ones_like(np_y)
+        expected_dx = np.matmul(dout, np.transpose(np_y))
+
+        # stop_gradient = !create_graph, create_graph default false
+        self.assertEqual(dx[0].stop_gradient, True)
+        self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
+
+    def test_simple_example_eager_grad(self):
+        with _test_eager_guard():
+            self.func_simple_example_eager_grad()
+        self.func_simple_example_eager_grad()
+
+    def func_simple_example_eager_grad_allow_unused(self):
+        np.random.seed(2021)
+        paddle.set_device('cpu')
+        np_x = np.random.random((3, 3))
+        np_y = np.random.random((3, 1))
+        np_z = np.random.random((3, 1))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
+        z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
+        out_z = paddle.nn.functional.sigmoid(z)
+        out = paddle.matmul(x, y)
+
+        dx = fluid.dygraph.grad(out, [x, z], allow_unused=True)
+        dout = np.ones_like(np_y)
+        expected_dx = np.matmul(dout, np.transpose(np_y))
+        self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
+        # stop_gradient = !create_graph, create_graph default false
+        self.assertEqual(dx[0].stop_gradient, True)
+        # x is unused input in the graph
+        self.assertEqual(dx[1], None)
+
+    def test_simple_example_eager_grad_allow_unused(self):
+        with _test_eager_guard():
+            self.func_simple_example_eager_grad_allow_unused()
+        self.func_simple_example_eager_grad_allow_unused()
+
+    def func_simple_example_eager_grad_not_allow_unused(self):
+        np.random.seed(2021)
+        paddle.set_device('cpu')
+        np_x = np.random.random((3, 3))
+        np_y = np.random.random((3, 1))
+        np_z = np.random.random((3, 1))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
+        z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
+        out_z = paddle.nn.functional.sigmoid(z)
+        out = paddle.matmul(x, y)
+
+        try:
+            # allow_unused is false in default
+            dx = fluid.dygraph.grad(out, [x, z])
+        except ValueError as e:
+            error_msg = cpt.get_exception_message(e)
+            assert error_msg.find("allow_unused") > 0
+
+    def test_simple_example_eager_grad_not_allow_unused(self):
+        with _test_eager_guard():
+            self.func_simple_example_eager_grad_not_allow_unused()
+        self.func_simple_example_eager_grad_not_allow_unused()
+
+    def func_simple_example_eager_grad_duplicate_input(self):
+        np.random.seed(2021)
+        paddle.set_device('cpu')
+        np_x = np.random.random((3, 3))
+        np_y = np.random.random((3, 1))
+        np_z = np.random.random((3, 1))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
+        z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
+        out_z = paddle.nn.functional.sigmoid(z)
+        out = paddle.matmul(x, y)
+
+        try:
+            # duplicate input will arise RuntimeError errors
+            dx = fluid.dygraph.grad(out, [x, x])
+        except RuntimeError as e:
+            error_msg = cpt.get_exception_message(e)
+            assert error_msg.find("duplicate") > 0
+
+    def test_simple_example_eager_grad_duplicate_input(self):
+        with _test_eager_guard():
+            self.func_simple_example_eager_grad_duplicate_input()
+        self.func_simple_example_eager_grad_duplicate_input()
+
+    def func_simple_example_eager_grad_duplicate_output(self):
+        np.random.seed(2021)
+        paddle.set_device('cpu')
+        np_x = np.random.random((3, 3))
+        np_y = np.random.random((3, 1))
+        np_z = np.random.random((3, 1))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
+        z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
+        out_z = paddle.nn.functional.sigmoid(z)
+        out = paddle.matmul(x, y)
+
+        try:
+            # duplicate output will arise RuntimeError errors
+            dx = fluid.dygraph.grad([out, out], [x])
+        except RuntimeError as e:
+            error_msg = cpt.get_exception_message(e)
+            assert error_msg.find("duplicate") > 0
+
+    def test_simple_example_eager_grad_duplicate_output(self):
+        with _test_eager_guard():
+            self.func_simple_example_eager_grad_duplicate_output()
+        self.func_simple_example_eager_grad_duplicate_output()
+
+
 class TestDygraphDoubleGrad(TestCase):
     def setUp(self):
         self.sort_sum_gradient = False
@@ -64,7 +189,7 @@ def grad(self,
             allow_unused=allow_unused)
 
     @dygraph_guard
-    def test_exception(self):
+    def func_exception(self):
         with self.assertRaises(AssertionError):
             self.grad(None, None)
 
@@ -93,8 +218,13 @@ def test_exception(self):
         with self.assertRaises(AssertionError):
             self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
 
+    def test_exception(self):
+        with _test_eager_guard():
+            self.func_exception()
+        self.func_exception()
+
     @dygraph_guard
-    def test_simple_example(self):
+    def func_simple_example(self):
         x = random_var(self.shape)
         x.stop_gradient = False
         y = x + 1
@@ -123,8 +253,44 @@ def test_simple_example(self):
             self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
                                 create_graph)
 
+    def test_simple_example(self):
+        with _test_eager_guard():
+            self.func_simple_example()
+        self.func_simple_example()
+
     @dygraph_guard
-    def test_none_one_initial_gradient(self):
+    def func_example_no_grad_vars(self):
+        x = random_var(self.shape)
+        x_np = x.numpy()
+        numel = x_np.size
+        x.stop_gradient = False
+
+        y1 = fluid.layers.relu(x)
+        y2 = fluid.layers.relu(x)
+        z = y1 + y2
+        w = z * z
+
+        w_mean = fluid.layers.reduce_mean(w)
+        del y1, z, w
+
+        dx_actual, = self.grad(
+            [w_mean], [x], create_graph=True, no_grad_vars=[y2])
+
+        self.assertFalse(y2.stop_gradient)
+        self.assertFalse(dx_actual.stop_gradient)
+
+        dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) *
+                       (x_np > 0) * 2).astype('float32')
+
+        self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
+
+    def test_example_no_grad_vars(self):
+        with _test_eager_guard():
+            self.func_example_no_grad_vars()
+        self.func_example_no_grad_vars()
+
+    @dygraph_guard
+    def func_none_one_initial_gradient(self):
         numel = 1
         for s in self.shape:
             numel *= s
@@ -190,8 +356,13 @@ def test_none_one_initial_gradient(self):
                             np.array_equal(grad_z.numpy(),
                                            original_random_grad_z))
 
+    def test_none_one_initial_gradient(self):
+        with _test_eager_guard():
+            self.func_none_one_initial_gradient()
+        self.func_none_one_initial_gradient()
+
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_create_graph(self):
+    def func_example_with_gradient_accumulation_and_create_graph(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -214,25 +385,33 @@ def test_example_with_gradient_accumulation_and_create_graph(self):
                        (x_np > 0) * 2).astype('float32')
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward(retain_graph=True)
-
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 / float(numel) *
-                           (x_np + dx_expected *
-                            (x_np > 0) * 2 / float(numel))).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
-
-        for i in range(5):
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
             loss.backward(retain_graph=True)
+
             x_grad_actual = x.gradient()
-            x_grad_expected = (i + 2) * (2.0 / float(numel) * (
+            x_grad_expected = (2.0 / float(numel) * (
                 x_np + dx_expected *
                 (x_np > 0) * 2 / float(numel))).astype('float32')
             self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
 
+            for i in range(5):
+                loss.backward(retain_graph=True)
+                x_grad_actual = x.gradient()
+                x_grad_expected = (i + 2) * (2.0 / float(numel) * (
+                    x_np + dx_expected *
+                    (x_np > 0) * 2 / float(numel))).astype('float32')
+                self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+
+    def test_example_with_gradient_accumulation_and_create_graph(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_create_graph()
+        self.func_example_with_gradient_accumulation_and_create_graph()
+
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_no_grad_vars(self):
+    def func_example_with_gradient_accumulation_and_no_grad_vars(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -256,17 +435,25 @@ def test_example_with_gradient_accumulation_and_no_grad_vars(self):
                        (x_np > 0) * 2).astype('float32')
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward()
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
+            loss.backward()
 
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 / float(numel) *
-                           (x_np + dx_expected *
-                            (x_np > 0) * 4 / float(numel))).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+            x_grad_actual = x.gradient()
+            x_grad_expected = (2.0 / float(numel) * (
+                x_np + dx_expected *
+                (x_np > 0) * 4 / float(numel))).astype('float32')
+            self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+
+    def test_example_with_gradient_accumulation_and_no_grad_vars(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_no_grad_vars()
+        self.func_example_with_gradient_accumulation_and_no_grad_vars()
 
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_not_create_graph(self):
+    def func_example_with_gradient_accumulation_and_not_create_graph(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -289,12 +476,20 @@ def test_example_with_gradient_accumulation_and_not_create_graph(self):
 
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward()
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
+            loss.backward()
 
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+            x_grad_actual = x.gradient()
+            x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
+            self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+
+    def test_example_with_gradient_accumulation_and_not_create_graph(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_not_create_graph()
+        self.func_example_with_gradient_accumulation_and_not_create_graph()
 
 
 class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
@@ -304,7 +499,7 @@ def setUp(self):
 
 
 class TestDygraphDoubleGradVisitedUniq(TestCase):
-    def test_compare(self):
+    def func_compare(self):
         value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
                                                           5).astype("float32")
 
@@ -349,6 +544,11 @@ def model_f(input):
 
         self.assertTrue(np.array_equal(grad_1, grad_2))
 
+    def test_compare(self):
+        with _test_eager_guard():
+            self.func_compare()
+        self.func_compare()
+
 
 class TestRaiseNoDoubleGradOp(TestCase):
     def raise_no_grad_op(self):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
index 5e3d3c811882f..cd31b13083de4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py
@@ -17,10 +17,11 @@
 import unittest
 import paddle.fluid as fluid
 import numpy as np
+from paddle.fluid.framework import _test_eager_guard
 
 
 class TestImperativePartitialBackward(unittest.TestCase):
-    def test_partitial_backward(self):
+    def func_partitial_backward(self):
         with fluid.dygraph.guard():
             x = np.random.randn(2, 4, 5).astype("float32")
             x = fluid.dygraph.to_variable(x)
@@ -49,6 +50,11 @@ def test_partitial_backward(self):
             linear1.clear_gradients()
             linear2.clear_gradients()
 
+    def test_partitial_backward(self):
+        with _test_eager_guard():
+            self.func_partitial_backward()
+        self.func_partitial_backward()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py
index c1a8299592a2b..e2ccb153f4063 100644
--- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py
+++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py
@@ -24,6 +24,7 @@
 class TestIndexSampleOp(OpTest):
     def setUp(self):
         self.op_type = "index_sample"
+        self.python_api = paddle.index_sample
         self.config()
         xnp = np.random.random(self.x_shape).astype(self.x_type)
         indexnp = np.random.randint(
@@ -39,10 +40,10 @@ def setUp(self):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=False)
 
     def config(self):
         """
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index bff10c9c4ca26..8dc822c69b2c5 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -1025,4 +1025,5 @@ def test_error(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inner.py b/python/paddle/fluid/tests/unittests/test_inner.py
index de9decd0b8961..ff9f15ebbfc82 100644
--- a/python/paddle/fluid/tests/unittests/test_inner.py
+++ b/python/paddle/fluid/tests/unittests/test_inner.py
@@ -163,4 +163,5 @@ def test_errors(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
new file mode 100644
index 0000000000000..33f55e0d51881
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
@@ -0,0 +1,400 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+
+
+class TestDygraphInplace(unittest.TestCase):
+    def setUp(self):
+        self.init_data()
+        self.set_np_compare_func()
+
+    def init_data(self):
+        self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1])
+        self.dtype = "float32"
+
+    def set_np_compare_func(self):
+        self.np_compare = np.array_equal
+
+    def non_inplace_api_processing(self, var):
+        return paddle.squeeze(var)
+
+    def inplace_api_processing(self, var):
+        return paddle.squeeze_(var)
+
+    def test_inplace_api(self):
+        with _test_eager_guard():
+            var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+            inplace_var = self.inplace_api_processing(var)
+            self.assertTrue(id(var) == id(inplace_var))
+
+            inplace_var.exp_()
+            self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy()))
+
+    def test_forward_version(self):
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                self.assertEqual(var.inplace_version, 0)
+
+                inplace_var = self.inplace_api_processing(var)
+                self.assertEqual(var.inplace_version, 1)
+
+                inplace_var.exp_()
+                self.assertEqual(var.inplace_version, 2)
+
+                inplace_var = self.inplace_api_processing(inplace_var)
+                self.assertEqual(var.inplace_version, 3)
+
+    def test_leaf_inplace_var_error(self):
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+                var.stop_gradient = False
+
+                def leaf_inplace_error():
+                    self.inplace_api_processing(var)
+
+                self.assertRaises(ValueError, leaf_inplace_error)
+
+    def test_backward_error(self):
+        # It raises an error because the inplace operator will result
+        # in incorrect gradient computation.
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(
+                    self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+
+                # Here, the gradient computation will use the value of var_b
+                var_c = var_b**2
+                self.inplace_api_processing(var_b)
+
+                loss = paddle.nn.functional.relu(var_c)
+                with self.assertRaisesRegexp(
+                        RuntimeError,
+                        "received current_inplace_version:{} != inplace_version_snapshot_:{}".
+                        format(1, 0)):
+                    loss.backward()
+
+    def test_backward_success_1(self):
+        # var_b is modified inplace before using it, the inplace operator doesn't result
+        # in incorrect gradient computation.
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(
+                    self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                # Here, the gradient computation will use the value of var_b
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(
+                    self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+                var_c = self.non_inplace_api_processing(var_b)
+                var_d = var_c**2
+                loss = var_d.sum()
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
+
+    def test_backward_success_2(self):
+        # Although var_b is modified inplace after using it, it does not used in gradient computation.
+        # The inplace operator doesn't result in incorrect gradient computation.
+        grad_var_a, grad_var_a_inplace = 0, 1
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(
+                    self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+
+                var_c = self.inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+
+                loss.backward()
+                grad_var_a_inplace = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.to_tensor(self.input_var_numpy).astype(
+                    self.dtype)
+                var_a.stop_gradient = False
+
+                var_b = var_a**2
+
+                var_c = self.non_inplace_api_processing(
+                    var_b)  # var_b is modified inplace before using it
+
+                var_d = var_c + var_c  # Here, the grad op of sum doesn't use the value of var_b
+                loss = var_d.sum()
+
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+        self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
+
+
+class TestDygraphInplaceUnsqueeze(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.unsqueeze(var, -1)
+
+    def inplace_api_processing(self, var):
+        return paddle.unsqueeze_(var, -1)
+
+
+class TestDygraphInplaceReshape(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.reshape(var, [-1])
+
+    def inplace_api_processing(self, var):
+        return paddle.reshape_(var, [-1])
+
+
+class TestDygraphInplaceFlatten(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.flatten()
+
+    def inplace_api_processing(self, var):
+        return var.flatten_()
+
+
+"""
+# This case will fail while using `_C_ops.final_state_scatter`.
+class TestDygraphInplaceScatter(TestDygraphInplace):
+    def init_data(self):
+        self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
+        self.dtype = "float32"
+
+    def non_inplace_api_processing(self, var):
+        index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
+        updates = paddle.to_tensor(
+            [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
+
+        return paddle.scatter(var, index, updates, overwrite=False)
+
+    def inplace_api_processing(self, var):
+        index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
+        updates = paddle.to_tensor(
+            [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
+
+        return paddle.scatter_(var, index, updates, overwrite=False)
+"""
+
+
+class TestDygraphInplaceElu(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.nn.functional.elu(var)
+
+    def inplace_api_processing(self, var):
+        return paddle.nn.functional.elu_(var)
+
+
+class TestDygraphInplaceRelu(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.nn.functional.relu(var)
+
+    def inplace_api_processing(self, var):
+        return paddle.nn.functional.relu_(var)
+
+
+class TestDygraphInplaceSoftmax(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.nn.functional.softmax(var)
+
+    def inplace_api_processing(self, var):
+        return paddle.nn.functional.softmax_(var)
+
+
+class TestDygraphInplaceTanh(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return paddle.tanh(var)
+
+    def inplace_api_processing(self, var):
+        return paddle.tanh_(var)
+
+
+class TestDygraphInplaceCeil(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.ceil()
+
+    def inplace_api_processing(self, var):
+        return var.ceil_()
+
+
+class TestDygraphInplaceFloor(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.floor()
+
+    def inplace_api_processing(self, var):
+        return var.floor_()
+
+
+class TestDygraphInplaceExp(TestDygraphInplace):
+    def set_np_compare_func(self):
+        self.np_compare = np.allclose
+
+    def non_inplace_api_processing(self, var):
+        return var.exp()
+
+    def inplace_api_processing(self, var):
+        return var.exp_()
+
+
+class TestDygraphInplaceReciprocal(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.reciprocal()
+
+    def inplace_api_processing(self, var):
+        return var.reciprocal_()
+
+
+class TestDygraphInplaceRound(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.round()
+
+    def inplace_api_processing(self, var):
+        return var.round_()
+
+
+class TestDygraphInplaceSqrt(TestDygraphInplace):
+    def init_data(self):
+        self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1])
+        self.dtype = "float32"
+
+    def non_inplace_api_processing(self, var):
+        return var.sqrt()
+
+    def inplace_api_processing(self, var):
+        return var.sqrt_()
+
+
+class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt):
+    def non_inplace_api_processing(self, var):
+        return var.rsqrt()
+
+    def inplace_api_processing(self, var):
+        return var.rsqrt_()
+
+
+class TestDygraphInplaceClip(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.clip(0.6, 1.5)
+
+    def inplace_api_processing(self, var):
+        return var.clip_(0.6, 1.5)
+
+
+class TestDygraphInplaceScale(TestDygraphInplace):
+    def non_inplace_api_processing(self, var):
+        return var.scale(scale=2.0, bias=3.0)
+
+    def inplace_api_processing(self, var):
+        return var.scale_(scale=2.0, bias=3.0)
+
+
+class TestDygraphInplaceAdd(TestDygraphInplace):
+    def init_data(self):
+        self.input_var_numpy = np.random.rand(2, 3, 4)
+        self.dtype = "float32"
+        self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype)
+
+    def non_inplace_api_processing(self, var):
+        input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
+        return var.add(input_var_2)
+
+    def inplace_api_processing(self, var):
+        input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
+        return var.add_(input_var_2)
+
+
+class TestDygraphInplaceSubtract(TestDygraphInplaceAdd):
+    def non_inplace_api_processing(self, var):
+        input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
+        return var.subtract(input_var_2)
+
+    def inplace_api_processing(self, var):
+        input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
+        return var.subtract_(input_var_2)
+
+
+class TestLossIsInplaceVar(unittest.TestCase):
+    def test_loss_is_inplace_var(self):
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.ones((2, 2))
+                var_a.stop_gradient = False
+
+                var_b = var_a * 2
+                loss = var_b.tanh_()
+
+                loss.backward()
+                inplace_grad_var_a = var_a.grad.numpy()
+
+        with paddle.fluid.dygraph.guard():
+            with _test_eager_guard():
+                var_a = paddle.ones((2, 2))
+                var_a.stop_gradient = False
+
+                var_b = var_a * 2
+                loss = var_b.tanh()
+
+                loss.backward()
+                grad_var_a = var_a.grad.numpy()
+
+        self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a))
+
+
+class TestContinuouslyInplace(unittest.TestCase):
+    def test_continuously_inplace(self):
+        with _test_eager_guard():
+            a = paddle.rand([2, 3])
+            a.stop_gradient = False
+            b = a * 2
+
+            b.reshape_([-1])
+            b.reshape_([2, 3])
+            b.reshape_([-1])
+
+            b.backward()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py
index 89ca28510b9b9..83aadbf68d569 100644
--- a/python/paddle/fluid/tests/unittests/test_io_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py
@@ -88,4 +88,5 @@ def test_when_train_with_no_grad(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_isclose_op.py b/python/paddle/fluid/tests/unittests/test_isclose_op.py
index aa39284d11349..2bb58d7c5741f 100644
--- a/python/paddle/fluid/tests/unittests/test_isclose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_isclose_op.py
@@ -210,6 +210,9 @@ def set_args(self):
         self.atol = np.array([0]).astype("float64")
         self.equal_nan = False
 
+    def test_check_output(self):
+        self.check_output()
+
 
 class TestIscloseOpLargeDimInput(TestIscloseOp):
     def set_args(self):
@@ -222,4 +225,5 @@ def set_args(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index ca9a489c7496f..b75dc2c964ca0 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -215,6 +215,8 @@ def test_with_place(place,
                                   for name in ['x', 'scale', 'bias', 'y@GRAD']
                               },
                               fetch_list=fetch_list)
+                # print(y)
+                # print(out[0])
                 self.__assert_close(y, out[0], "y")
                 self.__assert_close(mean, out[1], "mean")
                 self.__assert_close(variance, out[2], "variance", 1e-3)
@@ -238,6 +240,7 @@ def test_with_place(place,
 
     def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1)
+
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(
             shape=[2, 3, 4, 5],
@@ -432,4 +435,5 @@ def test_main(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 36038d656b773..bb244a20bd873 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1819,7 +1819,7 @@ def test_row_conv(self):
 
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
-    def test_group_norm(self):
+    def func_group_norm(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
         else:
@@ -1873,7 +1873,6 @@ def test_group_norm(self):
                 with_lod=True)[0]
 
         with self.dynamic_graph():
-            # TODO(wuweilong): Add with _test_eager_guard():
             groupNorm = nn.GroupNorm(
                 channels=shape[1],
                 groups=2,
@@ -1886,6 +1885,11 @@ def test_group_norm(self):
         self.assertTrue(np.allclose(static_ret, dy_rlt_value))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
+    def test_group_norm(self):
+        with _test_eager_guard():
+            self.func_group_norm()
+        self.func_group_norm()
+
     def test_instance_norm(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
@@ -2348,7 +2352,7 @@ def test_eye_op(self):
         with self.assertRaises(TypeError):
             layers.eye(num_rows=3, batch_shape=[-1])
 
-    def test_while_loop(self):
+    def func_while_loop(self):
         with self.static_graph():
             i = layers.fill_constant(shape=[1], dtype='int64', value=0)
             ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -2363,7 +2367,6 @@ def body(i):
             static_ret = self.get_static_graph_result(feed={}, fetch_list=out)
 
         with self.dynamic_graph():
-            # TODO(wuweilong): Add with _test_eager_guard():
             i = layers.fill_constant(shape=[1], dtype='int64', value=0)
             ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
 
@@ -2384,6 +2387,11 @@ def body2(i):
 
         self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy()))
 
+    def test_while_loop(self):
+        with _test_eager_guard():
+            self.func_while_loop()
+        self.func_while_loop()
+
     def test_compare(self):
         value_a = np.arange(3)
         value_b = np.arange(3)
diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py
index 16f954708d4d4..423eeaf3ada45 100644
--- a/python/paddle/fluid/tests/unittests/test_log_softmax.py
+++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py
@@ -175,4 +175,5 @@ def test_errors(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
index 6d94144fc7788..60dd4948f996e 100644
--- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py
@@ -555,4 +555,5 @@ def test_linear_warmp(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
index d0a40f38ba257..492f300e3b848 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py
@@ -105,14 +105,14 @@ def setUp(self):
         self.outputs = {'Out': result}
 
     def test_check_output(self):
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
         if core.is_compiled_with_rocm():
             self.check_grad(
-                ['X', 'Y'], 'Out', max_relative_error=1e-2, check_eager=True)
+                ['X', 'Y'], 'Out', max_relative_error=1e-2, check_eager=False)
         else:
-            self.check_grad(['X', 'Y'], 'Out', check_eager=True)
+            self.check_grad(['X', 'Y'], 'Out', check_eager=False)
 
 
 class TestMatMulOp2(TestMatMulV2Op):
@@ -346,7 +346,7 @@ def test_check_output(self):
                 place = core.CUDAPlace(0)
                 if core.is_float16_supported(place):
                     self.check_output_with_place(
-                        place, atol=atol, check_eager=True)
+                        place, atol=atol, check_eager=False)
 
         def test_check_grad(self):
             place = core.CUDAPlace(0)
@@ -355,7 +355,7 @@ def test_check_grad(self):
                     place, ['X', 'Y'],
                     'Out',
                     max_relative_error=max_relative_error,
-                    check_eager=True)
+                    check_eager=False)
 
     cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
     TestMatMulOpFp16Case.__name__ = cls_name
@@ -534,7 +534,7 @@ def init_grad_input_output(self):
         self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out)
 
     def test_check_output(self):
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
@@ -551,7 +551,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             user_defined_grads=[self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+            check_eager=False)
 
     def test_check_grad_ingore_y(self):
         self.check_grad(
@@ -598,7 +598,7 @@ def init_grad_input_output(self):
                              axis=0)
 
     def test_check_output(self):
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
@@ -606,7 +606,7 @@ def test_check_grad_normal(self):
             'Out',
             user_defined_grads=[self.grad_x, self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+            check_eager=False)
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
@@ -615,7 +615,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             user_defined_grads=[self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+            check_eager=False)
 
     def test_check_grad_ingore_y(self):
         self.check_grad(
@@ -624,7 +624,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             user_defined_grads=[self.grad_x],
             user_defined_grad_outputs=[self.grad_out],
-            check_eager=True)
+            check_eager=False)
 
 
 class TestMatMulTypePromotion(TestComplexMatMulOp):
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
index e2e118ac9e3b4..4e89a9034a341 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -19,6 +19,7 @@
 import numpy as np
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 
 
 def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
@@ -129,4 +130,5 @@ def test_errors(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_minus_op.py b/python/paddle/fluid/tests/unittests/test_minus_op.py
index 54253b17b9678..461ff6a9273cd 100644
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minus_op.py
@@ -17,6 +17,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
 
 
 class TestMinusOp(OpTest):
@@ -36,4 +37,5 @@ def test_check_grad(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
index cdb89bb964055..a65a1c7e14c2b 100644
--- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
@@ -20,6 +20,7 @@
 from paddle.fluid import core
 from op_test import OpTest
 import numpy as np
+import os
 
 
 def sample_output_one_dimension(out, dim):
@@ -216,5 +217,59 @@ def test_dim_less_than_1():
         self.assertRaises(ValueError, test_dim_less_than_1)
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        # Different GPU generatte different random value. Only test V100 here.
+        if not "V100" in paddle.device.cuda.get_device_name():
+            return
+
+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
+        print("Test Fixed Random number on V100 GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(100)
+
+        x = paddle.randint(0, 100, [1024, 10000]).astype('float32')
+        y = paddle.multinomial(x, 1, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 5187793)
+        self.assertEqual(np.mean(y), 5066.2041015625)
+        expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628]
+        self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect))
+
+        y = paddle.multinomial(x, 5000, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 25603962316)
+        self.assertEqual(np.mean(y), 5000.77388984375)
+        expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916]
+        self.assertTrue(np.array_equal(y[100, 1000:1010], expect))
+
+        y = paddle.multinomial(x, 5000, replacement=False).numpy()
+        self.assertEqual(np.sum(y), 25592855710)
+        self.assertEqual(np.mean(y), 4998.604630859375)
+        expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385]
+        self.assertTrue(np.array_equal(y[300, 3000:3010], expect))
+
+        y = paddle.multinomial(x, 20000, replacement=True).numpy()
+        self.assertEqual(np.sum(y), 102371362581)
+        self.assertEqual(np.mean(y), 4998.60168852539)
+        self.assertEqual(np.std(y), 2886.316308500771)
+        expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156]
+        self.assertTrue(np.array_equal(y[100, 0:10], expect))
+
+        y = paddle.multinomial(x, 20000, replacement=True).numpy()
+        self.assertEqual(np.sum(y), 102400672117)
+        self.assertEqual(np.mean(y), 5000.032818212891)
+        self.assertEqual(np.std(y), 2886.913426124017)
+        expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911]
+        self.assertTrue(np.array_equal(y[100, 0:10], expect))
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py
index e0d23e7871fb2..09ec702671bc9 100644
--- a/python/paddle/fluid/tests/unittests/test_mv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mv_op.py
@@ -27,15 +27,16 @@
 class TestMVOp(OpTest):
     def setUp(self):
         self.op_type = "mv"
+        self.python_api = paddle.mv
         self.init_config()
         self.inputs = {'X': self.x, 'Vec': self.vec}
         self.outputs = {'Out': np.dot(self.x, self.vec)}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Vec'], 'Out')
+        self.check_grad(['X', 'Vec'], 'Out', check_eager=True)
 
     def init_config(self):
         self.x = np.random.random((2, 100)).astype("float64")
@@ -107,4 +108,5 @@ def test_shape():
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index 575bc653618a5..ef912699455d1 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -588,4 +588,5 @@ def err_dtype(p, shape_x, xdtype, out=None):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py
new file mode 100644
index 0000000000000..0df9d2a3a41b4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import op_test
+import numpy as np
+import unittest
+import paddle
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid.backward import append_backward
+from paddle.distributed.models.moe import utils
+
+
+def count(x, upper_range):
+    res = np.zeros((upper_range, )).astype(int)
+    for i in x.reshape(-1):
+        if i >= 0 and i < len(res):
+            res[i] += 1
+    return res
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestExpertCountOpInt64(op_test.OpTest):
+    def setUp(self):
+        expert_num = 16
+        self.op_type = "number_count"
+        x = np.random.randint(-1, expert_num, size=(1000, 2)).astype('int64')
+        self.inputs = {'gate_idx': x}
+        self.outputs = {'Out': count(x, expert_num)}
+        self.attrs = {"upper_range": expert_num}
+
+    def test_forward(self):
+        self.check_output_with_place(paddle.CUDAPlace(0))
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestExpertCountAPI(unittest.TestCase):
+    def setUp(self):
+        self.upper_range = 320
+        self.x = np.random.randint(
+            -1, self.upper_range, size=(6000, 200)).astype('int64')
+        self.out = count(self.x, self.upper_range)
+        self.place = paddle.CUDAPlace(0)
+
+    def test_api_static(self):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.fluid.data('x', self.x.shape, dtype="int64")
+            out = utils._number_count(x, self.upper_range)
+            exe = paddle.static.Executor(self.place)
+            res = exe.run(feed={'x': self.x}, fetch_list=[out])
+            assert np.allclose(res, self.out)
+
+    def test_api_dygraph(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x)
+        out = utils._number_count(x, self.upper_range)
+        assert np.allclose(out.numpy(), self.out)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
index 66de1b309797f..fac258192112d 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
@@ -22,7 +22,8 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
-from paddle.fluid.framework import Program, program_guard
+from paddle.framework import _in_eager_mode
+from paddle.fluid.framework import Program, program_guard, _test_eager_guard
 
 
 class TestOneHotOp(OpTest):
@@ -45,7 +46,7 @@ def setUp(self):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_attr(OpTest):
@@ -68,7 +69,7 @@ def setUp(self):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_default_dtype(OpTest):
@@ -91,7 +92,7 @@ def setUp(self):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_default_dtype_attr(OpTest):
@@ -114,7 +115,7 @@ def setUp(self):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_out_of_range(OpTest):
@@ -132,7 +133,7 @@ def setUp(self):
         self.outputs = {'Out': (out, x_lod)}
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output()
 
 
 class TestOneHotOp_exception(unittest.TestCase):
@@ -190,6 +191,12 @@ def test_api_with_dygraph(self):
             one_hot_label = fluid.one_hot(
                 input=fluid.dygraph.to_variable(label), depth=depth)
 
+            one_hot_label = paddle.nn.functional.one_hot(
+                fluid.dygraph.to_variable(label), depth)
+            with _test_eager_guard():
+                one_hot_label = paddle.nn.functional.one_hot(
+                    paddle.to_tensor(label), depth)
+
     def _run(self, depth):
         label = fluid.layers.data(name="label", shape=[1], dtype="int64")
         one_hot_label = fluid.one_hot(input=label, depth=depth)
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
index 2ffe523ef6dda..531e9663a2b72 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 from unittest import TestCase
 import numpy as np
 import paddle
+from paddle.fluid.framework import _test_eager_guard
+import paddle.fluid.core as core
 
 
 def _dygraph_guard_(func):
@@ -62,7 +64,7 @@ def grad(self,
             allow_unused=allow_unused)
 
     @dygraph_guard
-    def test_exception(self):
+    def func_exception(self):
         with self.assertRaises(AssertionError):
             self.grad(None, None)
 
@@ -91,8 +93,13 @@ def test_exception(self):
         with self.assertRaises(AssertionError):
             self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
 
+    def test_exception(self):
+        with _test_eager_guard():
+            self.func_exception()
+        self.func_exception()
+
     @dygraph_guard
-    def test_simple_example(self):
+    def func_simple_example(self):
         x = random_var(self.shape)
         x.stop_gradient = False
         y = x + 1
@@ -121,8 +128,13 @@ def test_simple_example(self):
             self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
                                 create_graph)
 
+    def test_simple_example(self):
+        with _test_eager_guard():
+            self.func_simple_example()
+        self.func_simple_example()
+
     @dygraph_guard
-    def test_none_one_initial_gradient(self):
+    def func_none_one_initial_gradient(self):
         numel = 1
         for s in self.shape:
             numel *= s
@@ -188,8 +200,13 @@ def test_none_one_initial_gradient(self):
                             np.array_equal(grad_z.numpy(),
                                            original_random_grad_z))
 
+    def test_none_one_initial_gradient(self):
+        with _test_eager_guard():
+            self.func_none_one_initial_gradient()
+        self.func_none_one_initial_gradient()
+
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_create_graph(self):
+    def func_example_with_gradient_accumulation_and_create_graph(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -212,17 +229,25 @@ def test_example_with_gradient_accumulation_and_create_graph(self):
                        (x_np > 0) * 2).astype('float32')
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward()
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
+            loss.backward()
 
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 / float(numel) *
-                           (x_np + dx_expected *
-                            (x_np > 0) * 2 / float(numel))).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+            x_grad_actual = x.gradient()
+            x_grad_expected = (2.0 / float(numel) * (
+                x_np + dx_expected *
+                (x_np > 0) * 2 / float(numel))).astype('float32')
+            self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+
+    def test_example_with_gradient_accumulation_and_create_graph(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_create_graph()
+        self.func_example_with_gradient_accumulation_and_create_graph()
 
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_no_grad_vars(self):
+    def func_example_with_gradient_accumulation_and_no_grad_vars(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -246,17 +271,25 @@ def test_example_with_gradient_accumulation_and_no_grad_vars(self):
                        (x_np > 0) * 2).astype('float32')
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward()
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
+            loss.backward()
+
+            x_grad_actual = x.gradient()
+            x_grad_expected = (2.0 / float(numel) * (
+                x_np + dx_expected *
+                (x_np > 0) * 4 / float(numel))).astype('float32')
+            self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
 
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 / float(numel) *
-                           (x_np + dx_expected *
-                            (x_np > 0) * 4 / float(numel))).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+    def test_example_with_gradient_accumulation_and_no_grad_vars(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_no_grad_vars()
+        self.func_example_with_gradient_accumulation_and_no_grad_vars()
 
     @dygraph_guard
-    def test_example_with_gradient_accumulation_and_not_create_graph(self):
+    def func_example_with_gradient_accumulation_and_not_create_graph(self):
         x = random_var(self.shape)
         x_np = x.numpy()
         numel = x_np.size
@@ -279,12 +312,20 @@ def test_example_with_gradient_accumulation_and_not_create_graph(self):
 
         self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
 
-        loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
-        loss.backward()
+        if core._in_eager_mode():
+            pass
+        else:
+            loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
+            loss.backward()
 
-        x_grad_actual = x.gradient()
-        x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
-        self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+            x_grad_actual = x.gradient()
+            x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
+            self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
+
+    def test_example_with_gradient_accumulation_and_not_create_graph(self):
+        with _test_eager_guard():
+            self.func_example_with_gradient_accumulation_and_not_create_graph()
+        self.func_example_with_gradient_accumulation_and_not_create_graph()
 
 
 class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
index 9e0cf6ddef2d6..8945d35c131fd 100644
--- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py
@@ -315,7 +315,9 @@ def test_single_pickle_var_dygraph(self):
         paddle.save(tensor, path)
         t_dygraph = paddle.load(path)
         np_dygraph = paddle.load(path, return_numpy=True)
-        self.assertTrue(isinstance(t_dygraph, paddle.fluid.core.VarBase))
+        self.assertTrue(
+            isinstance(t_dygraph, (paddle.fluid.core.VarBase,
+                                   paddle.fluid.core.eager.Tensor)))
         self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph))
         self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy()))
         paddle.enable_static()
@@ -685,27 +687,34 @@ def test_save_load_complex_object_static_save(self):
                         np.array(v), np.array(load_tensor2['k2'][k])))
             self.assertTrue(load_tensor2['epoch'] == 123)
 
-            self.assertTrue(isinstance(load_tensor3[0], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor3[0], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0]))
-            self.assertTrue(isinstance(load_tensor3[1], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor3[1], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1]))
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["state_dict"][k],
-                               fluid.core.VarBase))
+                    isinstance(load_tensor3[2]["state_dict"][k], (
+                        fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["state_dict"][k].numpy(),
                                    np.array(v)))
 
             for k, v in state_dict.items():
                 self.assertTrue(
-                    isinstance(load_tensor3[2]["opt"][k], fluid.core.VarBase))
+                    isinstance(load_tensor3[2]["opt"][k], (
+                        fluid.core.VarBase, fluid.core.eager.Tensor)))
                 self.assertTrue(
                     np.array_equal(load_tensor3[2]["opt"][k].numpy(),
                                    np.array(v)))
 
-            self.assertTrue(isinstance(load_tensor4[0], fluid.core.VarBase))
+            self.assertTrue(
+                isinstance(load_tensor4[0], (fluid.core.VarBase,
+                                             fluid.core.eager.Tensor)))
             self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0]))
 
             load_array1 = paddle.load(path1, return_numpy=True)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
index 802fcc96288f6..2530fc07753e8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
@@ -205,5 +205,10 @@ def test_multiple_gpus_dynamic(self):
         self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py')
 
 
+class TestGradientCheckInEagerMode(TestMultipleGpus):
+    def test_multiple_gpus_dynamic(self):
+        self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py')
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
index 838ccae37cfa5..73b501c9c7ead 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py
@@ -56,7 +56,15 @@ def test_statistic_case1(self):
         mobilenet_node = HostPythonNode(
             'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
         yolonet_node = HostPythonNode(
-            'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
+            'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
+
+        userdefined_node = HostPythonNode('Communication Time',
+                                          profiler.TracerEventType.UserDefined,
+                                          100, 110, 1000, 1001)
+
+        communication_node = HostPythonNode(
+            'Communication', profiler.TracerEventType.Communication, 105, 110,
+            1000, 1001)
         backward_node = HostPythonNode('Gradient Backward',
                                        profiler.TracerEventType.Backward, 120,
                                        200, 1000, 1001)
@@ -114,7 +122,9 @@ def test_statistic_case1(self):
             optimization_node
         ])
         mobilenet_node.children_node.append(conv2d_node)
-        yolonet_node.children_node.append(sync_batch_norm_node)
+        yolonet_node.children_node.extend(
+            [sync_batch_norm_node, userdefined_node])
+        userdefined_node.children_node.append(communication_node)
         conv2d_node.children_node.extend(
             [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
         conv2d_compute.runtime_node.append(conv2d_launchkernel)
@@ -145,7 +155,7 @@ def test_statistic_case1(self):
                 profiler.TracerEventType.ProfileStep), 400)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.Forward), 90)
+                profiler.TracerEventType.Forward), 100)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
                 profiler.TracerEventType.Backward), 80)
@@ -169,15 +179,18 @@ def test_statistic_case1(self):
                 0, profiler.TracerEventType.Memcpy), 60)
         self.assertEqual(
             time_range_summary.get_cpu_range_sum(
-                profiler.TracerEventType.UserDefined), 15)
+                profiler.TracerEventType.UserDefined), 25)
+        self.assertEqual(
+            time_range_summary.get_cpu_range_sum(
+                profiler.TracerEventType.Communication), 5)
         self.assertEqual(len(event_summary.items), 2)
-        self.assertEqual(len(event_summary.userdefined_items), 0)
+        self.assertEqual(len(event_summary.userdefined_items), 1)
         self.assertEqual(len(event_summary.model_perspective_items), 3)
         self.assertEqual(len(event_summary.memory_manipulation_items), 1)
         self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
         self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
         self.assertEqual(
-            event_summary.model_perspective_items['Forward'].cpu_time, 90)
+            event_summary.model_perspective_items['Forward'].cpu_time, 100)
         self.assertEqual(
             event_summary.model_perspective_items['Forward'].gpu_time, 135)
         self.assertEqual(
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index 4361a45f1568f..2380ccb14aaee 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -18,6 +18,7 @@
 import paddle
 import paddle.fluid.core as core
 from paddle.static import program_guard, Program
+import os
 
 
 def check_randperm_out(n, data_np):
@@ -129,5 +130,81 @@ def test_out(self):
         paddle.enable_static()
 
 
+class TestRandomValue(unittest.TestCase):
+    def test_fixed_random_number(self):
+        # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
+            return
+
+        print("Test Fixed Random number on GPU------>")
+        paddle.disable_static()
+        paddle.set_device('gpu')
+        paddle.seed(2021)
+
+        x = paddle.randperm(30000, dtype='int32').numpy()
+        expect = [
+            24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413
+        ]
+        self.assertTrue(np.array_equal(x[0:10], expect))
+        expect = [
+            29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096
+        ]
+        self.assertTrue(np.array_equal(x[10000:10010], expect))
+        expect = [
+            298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343
+        ]
+        self.assertTrue(np.array_equal(x[20000:20010], expect))
+
+        x = paddle.randperm(30000, dtype='int64').numpy()
+        expect = [
+            6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171
+        ]
+        self.assertTrue(np.array_equal(x[0:10], expect))
+        expect = [
+            23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818
+        ]
+        self.assertTrue(np.array_equal(x[10000:10010], expect))
+        expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839]
+        self.assertTrue(np.array_equal(x[20000:20010], expect))
+
+        x = paddle.randperm(30000, dtype='float32').numpy()
+        expect = [
+            5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144.,
+            22906., 10705.
+        ]
+        self.assertTrue(np.array_equal(x[0:10], expect))
+        expect = [
+            1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564.,
+            26991.
+        ]
+        self.assertTrue(np.array_equal(x[10000:10010], expect))
+        expect = [
+            25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339.,
+            4662.
+        ]
+        self.assertTrue(np.array_equal(x[20000:20010], expect))
+
+        x = paddle.randperm(30000, dtype='float64').numpy()
+        expect = [
+            19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147.,
+            9163.
+        ]
+        self.assertTrue(np.array_equal(x[0:10], expect))
+        expect = [
+            15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945.,
+            17624.
+        ]
+        self.assertTrue(np.array_equal(x[10000:10010], expect))
+        expect = [
+            10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202.,
+            21404.
+        ]
+        self.assertTrue(np.array_equal(x[20000:20010], expect))
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_renorm_op.py b/python/paddle/fluid/tests/unittests/test_renorm_op.py
index 3ea2002a9786f..e00a892cf7197 100644
--- a/python/paddle/fluid/tests/unittests/test_renorm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_renorm_op.py
@@ -54,7 +54,7 @@ def test_renorm_api(self):
     def test_dygraph_api(self):
         self.input_data()
         # case axis none
-        with fluid.dygraph.guard():
+        with fluid.dygraph.guard(fluid.CPUPlace()):
             input = [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]]
             x = paddle.to_tensor(input, stop_gradient=False)
             y = paddle.renorm(x, 1.0, 2, 2.05)
@@ -94,4 +94,5 @@ def test_dygraph_api(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
index ca324b4a8fd05..1bfc1b00aa822 100644
--- a/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
+++ b/python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py
@@ -23,6 +23,7 @@
 from test_multiclass_nms_op import nms
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle
 
 
 def multiclass_nms(prediction, class_num, keep_top_k, nms_threshold):
@@ -518,4 +519,5 @@ def test_iminfo_tensor_dtype():
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py
new file mode 100644
index 0000000000000..a2f12fbf5809b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_run.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import subprocess
+import sys, os
+import json
+import shutil
+
+import random
+
+from os import listdir
+from os.path import isfile, join
+
+pyname = 'train.py'
+colpyfile = '''# train.py for unitest
+import os
+env = os.environ.copy()
+assert "PADDLE_MASTER" in env
+assert "PADDLE_GLOBAL_SIZE" in env
+assert "PADDLE_LOCAL_SIZE" in env
+assert "PADDLE_GLOBAL_RANK" in env
+assert "PADDLE_LOCAL_RANK" in env
+'''
+
+pspyfile = '''# train.py for unitest
+import os
+env = os.environ.copy()
+assert "PADDLE_PSERVERS_IP_PORT_LIST" in env
+assert "PADDLE_TRAINER_ENDPOINTS" in env
+assert "PADDLE_ROLE" in env
+#assert "PADDLE_RANK" in env
+'''
+
+
+def write_file(name, ct):
+    with open(name, "w") as f:
+        f.write(ct)
+
+
+def get_files(pth, prefix):
+    return [
+        f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix)
+    ]
+
+
+class Collective_Test(unittest.TestCase):
+    def setUp(self):
+        write_file(pyname, colpyfile)
+
+    def pdrun(self, args, env=None):
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
+        if args:
+            cmd.extend(args.split(" "))
+        cmd.extend([pyname])
+        proc = subprocess.Popen(cmd, env)
+        return proc
+
+    def test_collective_1(self):
+        args = "--job_id test1"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+    def test_collective_2(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--job_id test2 --devices 0,1,2"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+        c = get_files('log', 'test2')
+        self.assertTrue(len(c) == 4)
+
+    def test_collective_3(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        port = random.randrange(6000, 8000)
+        args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
+            port)
+        p1 = self.pdrun(args)
+        p2 = self.pdrun(args)
+        p1.wait()
+        p2.wait()
+        self.assertTrue(p1.poll() == 0)
+        self.assertTrue(p2.poll() == 0)
+
+        c = get_files('log', 'test3')
+        self.assertTrue(len(c) == 6)
+
+
+class PS_Test(unittest.TestCase):
+    def setUp(self):
+        write_file(pyname, pspyfile)
+
+    def pdrun(self, args, env=None):
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
+        if args:
+            cmd.extend(args.split(" "))
+        cmd.extend([pyname])
+        proc = subprocess.Popen(cmd, env)
+        return proc
+
+    def test_ps_1(self):
+        args = "--run_mode ps"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+    def test_ps_2(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--job_id ps2 --server_num=2 --trainer_num=2"
+        p = self.pdrun(args)
+        p.wait()
+        self.assertTrue(p.poll() == 0)
+
+        c = get_files('log', 'ps2')
+        self.assertTrue(len(c) == 5)
+
+    def test_ps_3(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        port = random.randrange(6000, 8000)
+        args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
+            port)
+        p1 = self.pdrun(args)
+        p2 = self.pdrun(args)
+        p1.wait()
+        p2.wait()
+        self.assertTrue(p1.poll() == 0)
+        self.assertTrue(p2.poll() == 0)
+
+        c = get_files('log', 'ps3')
+        self.assertTrue(len(c) == 6)
+
+    def test_ps_4(self):
+        if os.path.exists('./log'):
+            shutil.rmtree('./log')
+
+        args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
+        p1 = self.pdrun(args)
+        p1.wait()
+        self.assertTrue(p1.poll() == 0)
+
+        c = get_files('log', 'ps4')
+        self.assertTrue(len(c) == 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
index 418155a865cb8..d7a27bbddebba 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py
@@ -67,6 +67,7 @@ class TestScatterNdAddSimpleOp(OpTest):
 
     def setUp(self):
         self.op_type = "scatter_nd_add"
+        self.python_api = paddle.scatter_nd_add
         ref_np = np.random.random([100]).astype("float64")
         index_np = np.random.randint(0, 100, [100, 1]).astype("int32")
         updates_np = np.random.random([100]).astype("float64")
@@ -76,10 +77,10 @@ def setUp(self):
         self.outputs = {'Out': expect_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Updates'], 'Out')
+        self.check_grad(['X', 'Updates'], 'Out', check_eager=False)
 
 
 class TestScatterNdAddWithEmptyIndex(OpTest):
@@ -89,6 +90,7 @@ class TestScatterNdAddWithEmptyIndex(OpTest):
 
     def setUp(self):
         self.op_type = "scatter_nd_add"
+        self.python_api = paddle.scatter_nd_add
         ref_np = np.random.random((10, 10)).astype("float64")
         index_np = np.array([[], []]).astype("int32")
         updates_np = np.random.random((2, 10, 10)).astype("float64")
@@ -99,10 +101,10 @@ def setUp(self):
         self.outputs = {'Out': expect_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Updates'], 'Out')
+        self.check_grad(['X', 'Updates'], 'Out', check_eager=False)
 
 
 class TestScatterNdAddWithHighRankSame(OpTest):
@@ -112,6 +114,7 @@ class TestScatterNdAddWithHighRankSame(OpTest):
 
     def setUp(self):
         self.op_type = "scatter_nd_add"
+        self.python_api = paddle.scatter_nd_add
         shape = (3, 2, 2, 1, 10)
         ref_np = np.random.rand(*shape).astype("float64")
         index_np = np.vstack(
@@ -125,10 +128,10 @@ def setUp(self):
         self.outputs = {'Out': expect_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Updates'], 'Out')
+        self.check_grad(['X', 'Updates'], 'Out', check_eager=False)
 
 
 class TestScatterNdAddWithHighRankDiff(OpTest):
@@ -138,6 +141,7 @@ class TestScatterNdAddWithHighRankDiff(OpTest):
 
     def setUp(self):
         self.op_type = "scatter_nd_add"
+        self.python_api = paddle.scatter_nd_add
         shape = (8, 2, 2, 1, 10)
         ref_np = np.random.rand(*shape).astype("double")
         index = np.vstack([np.random.randint(0, s, size=500) for s in shape]).T
@@ -150,10 +154,10 @@ def setUp(self):
         self.outputs = {'Out': expect_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Updates'], 'Out')
+        self.check_grad(['X', 'Updates'], 'Out', check_eager=False)
 
 
 #Test Python API
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index ad542da781670..d7f8886dcd3c1 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -27,6 +27,7 @@
 class TestScatterOp(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 50)).astype("float32")
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 50)).astype("float32")
@@ -36,15 +37,16 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(["X", "Updates"], "Out")
+        self.check_grad(["X", "Updates"], "Out", check_eager=False)
 
 
 class TestScatterOp0(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
@@ -55,15 +57,16 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(["X", "Updates"], "Out")
+        self.check_grad(["X", "Updates"], "Out", check_eager=False)
 
 
 class TestScatterOp1(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         zeros_np = np.zeros([2, 3]).astype('float32')
         index_np = np.array([1, 1]).astype("int32")
@@ -77,10 +80,10 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(["X", "Updates"], "Out")
+        self.check_grad(["X", "Updates"], "Out", check_eager=False)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -88,6 +91,7 @@ def test_check_grad(self):
 class TestScatterOp2(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
@@ -99,12 +103,13 @@ def setUp(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
+            self.check_output_with_place(place, atol=1e-3, check_eager=False)
 
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['X', 'Updates'], 'Out')
+            self.check_grad_with_place(
+                place, ['X', 'Updates'], 'Out', check_eager=False)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -112,6 +117,7 @@ def test_check_grad(self):
 class TestScatterOp3(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         zeros_np = np.zeros([2, 3]).astype('float32')
         index_np = np.array([1, 1]).astype("int32")
@@ -127,17 +133,19 @@ def setUp(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
+            self.check_output_with_place(place, atol=1e-3, check_eager=False)
 
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['X', 'Updates'], 'Out')
+            self.check_grad_with_place(
+                place, ['X', 'Updates'], 'Out', check_eager=False)
 
 
 class TestScatterOp4(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         index_np = np.array([1, 2]).astype("int64")
         updates_np = np.random.random((2, 3)).astype("float32")
@@ -147,10 +155,10 @@ def setUp(self):
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Updates'], 'Out')
+        self.check_grad(['X', 'Updates'], 'Out', check_eager=False)
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -158,6 +166,7 @@ def test_check_grad(self):
 class TestScatterOp5(OpTest):
     def setUp(self):
         self.op_type = "scatter"
+        self.python_api = paddle.scatter
         ref_np = np.ones((3, 3)).astype("float32")
         index_np = np.array([1, 2]).astype("int64")
         updates_np = np.random.random((2, 3)).astype("float32")
@@ -169,12 +178,13 @@ def setUp(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
+            self.check_output_with_place(place, atol=1e-3, check_eager=False)
 
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
-            self.check_grad_with_place(place, ['X', 'Updates'], 'Out')
+            self.check_grad_with_place(
+                place, ['X', 'Updates'], 'Out', check_eager=False)
 
 
 class TestScatterAPI(unittest.TestCase):
@@ -274,6 +284,7 @@ def test_static_graph():
 class TestScatterOpFp16(OpTest):
     def setUp(self):
         self.__class__.op_type = "scatter"
+        self.python_api = paddle.scatter
         # compute grad in the following code handly.
         self.__class__.no_need_check_grad = True
         self.x_type = 'float16'
diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py
index aa093069c49ec..805aabd393e49 100644
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_scope.py
@@ -59,6 +59,13 @@ def test_scope_pool(self):
             # It is not allowed to delete a nonexistent scope.
             scope._remove_from_pool()
 
+    def test_size(self):
+        paddle_c = paddle.fluid.core
+        scope = paddle_c.Scope()
+        var_a = scope.var("var_a")
+        self.assertEqual(scope.size(), 1)
+        self.assertIsNotNone(scope.find_var('var_a'))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py
index 42225468bc41c..f7b145d358ec9 100644
--- a/python/paddle/fluid/tests/unittests/test_set_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py
@@ -22,6 +22,7 @@
 import paddle
 from paddle.fluid.layer_helper import LayerHelper
 from functools import reduce
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 
 class TestSetValueBase(unittest.TestCase):
@@ -69,7 +70,7 @@ def _run_dynamic(self):
         paddle.enable_static()
         return out
 
-    def test_api(self):
+    def func_test_api(self):
         static_out = self._run_static()
         dynamic_out = self._run_dynamic()
         self._get_answer()
@@ -82,6 +83,11 @@ def test_api(self):
             (self.data == dynamic_out).all(),
             msg=error_msg.format("dynamic", self.data, dynamic_out))
 
+    def test_api(self):
+        with _test_eager_guard():
+            self.func_test_api()
+        self.func_test_api()
+
 
 # 1. Test different type of item: int, Python slice, Paddle Tensor
 # 1.1 item is int
@@ -995,9 +1001,9 @@ def test_static(self):
             fetch_list=[var.name + "@GRAD", z.name + "@GRAD"])
 
         self.assertTrue((var_grad == z_grad[0, :]).all())
-
-    def test_dynamic(self):
         paddle.disable_static()
+
+    def func_test_dynamic(self):
         model = Model()
         x = paddle.ones([1, 12, 3, 3]).astype("float32")
         y = paddle.ones([1, 12, 3, 3]).astype("float32")
@@ -1006,11 +1012,18 @@ def test_dynamic(self):
 
         self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape)
         # 
-        self.assertTrue((0 == x.grad[0, :, 0, 0]).all())
+        # TODO(pangyoki) add inplace and delete if
+        if not _in_eager_mode():
+            self.assertTrue((0 == x.grad[0, :, 0, 0]).all())
+
+    def test_dynamic(self):
+        with _test_eager_guard():
+            self.func_test_dynamic()
+        self.func_test_dynamic()
 
 
 class TestGradientTruncated(unittest.TestCase):
-    def test_consistent_with_competitor(self):
+    def func_test_consistent_with_competitor(self):
         paddle.disable_static()
 
         def set_value(t, value):
@@ -1182,6 +1195,11 @@ def set_value5(t, value):
         self.assertTrue(~x.stop_gradient)
         self.assertTrue(~x.is_leaf)
 
+    def test_consistent_with_competitor(self):
+        with _test_eager_guard():
+            self.func_test_consistent_with_competitor()
+        self.func_test_consistent_with_competitor()
+
     def test_static_graph(self):
         paddle.enable_static()
 
@@ -1328,6 +1346,7 @@ def set_value(array, i, op):
                 self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all())
 
             array = array[0]
+        paddle.disable_static()
 
 
 class TestSetValueInplace(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
index 9a97f57aaae5f..74409c8671059 100644
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss.py
@@ -178,4 +178,5 @@ def test_smooth_l1_loss_delta(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
new file mode 100644
index 0000000000000..8284771920e81
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+from paddle import _C_ops
+from paddle.fluid.framework import _test_eager_guard
+
+
+class TestSparseUtils(unittest.TestCase):
+    def test_to_sparse_coo(self):
+        with _test_eager_guard():
+            x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
+            non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
+            non_zero_elements = [1, 2, 3, 4, 5]
+            dense_x = paddle.to_tensor(x)
+            #TODO(zhangkaihuo): change to test the corresponding API
+            out = _C_ops.final_state_to_sparse_coo(dense_x, 2)
+            print(out)
+            assert np.array_equal(out.non_zero_indices().numpy(),
+                                  non_zero_indices)
+            assert np.array_equal(out.non_zero_elements().numpy(),
+                                  non_zero_elements)
+
+            dense_tensor = _C_ops.final_state_to_dense(out)
+            assert np.array_equal(dense_tensor.numpy(), x)
+
+    def test_to_sparse_csr(self):
+        with _test_eager_guard():
+            x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]]
+            non_zero_crows = [0, 2, 3, 5]
+            non_zero_cols = [1, 3, 2, 0, 1]
+            non_zero_elements = [1, 2, 3, 4, 5]
+            dense_x = paddle.to_tensor(x)
+            out = _C_ops.final_state_to_sparse_csr(dense_x)
+            print(out)
+            assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows)
+            assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols)
+            assert np.array_equal(out.non_zero_elements().numpy(),
+                                  non_zero_elements)
+
+            dense_tensor = _C_ops.final_state_to_dense(out)
+            assert np.array_equal(dense_tensor.numpy(), x)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_stft_op.py b/python/paddle/fluid/tests/unittests/test_stft_op.py
new file mode 100644
index 0000000000000..64b8084a1651f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_stft_op.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from numpy.lib.stride_tricks import as_strided
+import paddle
+import unittest
+
+from op_test import OpTest
+
+
+def frame_from_librosa(x, frame_length, hop_length, axis=-1):
+    if axis == -1 and not x.flags["C_CONTIGUOUS"]:
+        x = np.ascontiguousarray(x)
+    elif axis == 0 and not x.flags["F_CONTIGUOUS"]:
+        x = np.asfortranarray(x)
+
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    if axis == -1:
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * x.itemsize]
+
+    elif axis == 0:
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * x.itemsize] + list(strides)
+
+    else:
+        raise ValueError("Frame axis={} must be either 0 or -1".format(axis))
+
+    return as_strided(x, shape=shape, strides=strides)
+
+
+def stft_np(x, n_fft, hop_length, **kwargs):
+    frames = frame_from_librosa(x, n_fft, hop_length)
+    res = np.fft.rfft(frames, axis=1)
+    return res
+
+
+class TestStftOp(OpTest):
+    def setUp(self):
+        self.op_type = "stft"
+        self.shape, self.type, self.attrs = self.initTestCase()
+        self.inputs = {
+            'X': np.random.random(size=self.shape).astype(self.type),
+        }
+        self.outputs = {'Out': stft_np(x=self.inputs['X'], **self.attrs)}
+
+    def initTestCase(self):
+        input_shape = (2, 100)
+        input_type = 'float64'
+        attrs = {
+            'n_fft': 50,
+            'hop_length': 15,
+            'normalized': False,
+            'onesided': True,
+        }
+        return input_shape, input_type, attrs
+
+    def test_check_output(self):
+        paddle.enable_static()
+        self.check_output()
+        paddle.disable_static()
+
+    def test_check_grad_normal(self):
+        paddle.enable_static()
+        self.check_grad(['X'], 'Out')
+        paddle.disable_static()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
index aac8b6a99b649..086527ab55435 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py
@@ -20,6 +20,8 @@
 import paddle
 import paddle.nn as nn
 from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
+import paddle.fluid as fluid
+import paddle.fluid.core as core
 
 
 class SimpleNet(nn.Layer):
@@ -445,8 +447,7 @@ def test_multiple_hooks_for_interior_var(self):
             self.func_multiple_hooks_for_interior_var()
         self.func_multiple_hooks_for_interior_var()
 
-    # TODO(wuweilong): enable this case when DoubleGrad in eager mode is ready
-    def test_hook_in_double_grad(self):
+    def func_hook_in_double_grad(self):
         def double_print_hook(grad):
             grad = grad * 2
             print(grad)
@@ -461,10 +462,11 @@ def double_print_hook(grad):
         x.register_hook(double_print_hook)
 
         y = x * x
-
+        fluid.set_flags({'FLAGS_retain_grad_for_all_tensor': False})
         # Since y = x * x, dx = 2 * x
         dx = paddle.grad(
             outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0]
+        fluid.set_flags({'FLAGS_retain_grad_for_all_tensor': True})
 
         z = y + dx
         self.assertTrue(x.grad is None)
@@ -475,8 +477,17 @@ def double_print_hook(grad):
         # x.gradient() = 2 * x + 2 = 4.0
         # after changed by hook: 8.0
 
-        z.backward()
-        self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.])))
+        # TODO(wuweilong): enable this case when DoubleGrad in eager mode is ready
+        if core._in_eager_mode():
+            pass
+        else:
+            z.backward()
+            self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.])))
+
+    def test_hook_in_double_grad(self):
+        with _test_eager_guard():
+            self.func_hook_in_double_grad()
+        self.func_hook_in_double_grad()
 
     def func_remove_one_hook_multiple_times(self):
         for device in self.devices:
diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py
index b0f065a26a006..8359141f309f5 100644
--- a/python/paddle/fluid/tests/unittests/test_tile_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tile_op.py
@@ -22,7 +22,7 @@
 from paddle.fluid import compiler, Program, program_guard
 
 
-# Situation 1: repeat_times is a list (without tensor)
+#Situation 1: repeat_times is a list (without tensor)
 class TestTileOpRank1(OpTest):
     def setUp(self):
         self.op_type = "tile"
@@ -248,4 +248,5 @@ def test_api(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index 1e6b4354dd9c8..c890c3c607cb0 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -29,6 +29,7 @@ class TestTransposeOp(OpTest):
     def setUp(self):
         self.init_op_type()
         self.initTestCase()
+        self.python_api = paddle.transpose
         self.inputs = {'X': np.random.random(self.shape).astype("float64")}
         self.attrs = {
             'axis': list(self.axis),
@@ -44,10 +45,10 @@ def init_op_type(self):
         self.use_mkldnn = False
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_eager=True)
 
     def initTestCase(self):
         self.shape = (3, 40)
diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py
index b70fa04adc13c..5bb3e99ee302f 100644
--- a/python/paddle/fluid/tests/unittests/test_trunc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py
@@ -29,6 +29,7 @@
 class TestTruncOp(OpTest):
     def setUp(self):
         self.op_type = "trunc"
+        self.python_api = paddle.trunc
         self.dtype = np.float64
         np.random.seed(2021)
         self.inputs = {'X': np.random.random((20, 20)).astype(self.dtype)}
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index dbd40c349bbc8..4b3e935426f9f 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -22,6 +22,7 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
 
 
 class TestVarBase(unittest.TestCase):
@@ -874,7 +875,7 @@ def _test_list_index(self):
         col = np.array([2, 1, 3])
         self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy()))
 
-    def test_slice(self):
+    def func_test_slice(self):
         with fluid.dygraph.guard():
             self._test_slice()
             self._test_slice_for_tensor_attr()
@@ -899,6 +900,11 @@ def test_slice(self):
                 mask = np.array([1, 0, 1, 0], dtype=bool)
                 var[paddle.to_tensor([0, 1]), mask]
 
+    def test_slice(self):
+        with _test_eager_guard():
+            self.func_test_slice()
+        self.func_test_slice()
+
     def test_var_base_to_np(self):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array)
@@ -1125,7 +1131,6 @@ def test_print_tensor_dtype(self):
 
 class TestVarBaseSetitem(unittest.TestCase):
     def setUp(self):
-        paddle.disable_static()
         self.set_dtype()
         self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype))
         self.np_value = np.random.random((2, 3)).astype(self.dtype)
@@ -1135,12 +1140,13 @@ def set_dtype(self):
         self.dtype = "int32"
 
     def _test(self, value):
-        paddle.disable_static()
-        self.assertEqual(self.tensor_x.inplace_version, 0)
+        if not _in_eager_mode():
+            self.assertEqual(self.tensor_x.inplace_version, 0)
 
         id_origin = id(self.tensor_x)
         self.tensor_x[0] = value
-        self.assertEqual(self.tensor_x.inplace_version, 1)
+        if not _in_eager_mode():
+            self.assertEqual(self.tensor_x.inplace_version, 1)
 
         if isinstance(value, (six.integer_types, float)):
             result = np.zeros((2, 3)).astype(self.dtype) + value
@@ -1152,27 +1158,47 @@ def _test(self, value):
         self.assertEqual(id_origin, id(self.tensor_x))
 
         self.tensor_x[1:2] = value
-        self.assertEqual(self.tensor_x.inplace_version, 2)
+        if not _in_eager_mode():
+            self.assertEqual(self.tensor_x.inplace_version, 2)
         self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result))
         self.assertEqual(id_origin, id(self.tensor_x))
 
         self.tensor_x[...] = value
-        self.assertEqual(self.tensor_x.inplace_version, 3)
+        if not _in_eager_mode():
+            self.assertEqual(self.tensor_x.inplace_version, 3)
         self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result))
         self.assertEqual(id_origin, id(self.tensor_x))
 
-    def test_value_tensor(self):
-        paddle.disable_static()
+    def func_test_value_tensor(self):
         self._test(self.tensor_value)
 
-    def test_value_numpy(self):
-        paddle.disable_static()
+    def test_value_tensor(self):
+        with _test_eager_guard():
+            self.setUp()
+            self.func_test_value_tensor()
+        self.setUp()
+        self.func_test_value_tensor()
+
+    def func_test_value_numpy(self):
         self._test(self.np_value)
 
-    def test_value_int(self):
-        paddle.disable_static()
+    def test_value_numpy(self):
+        with _test_eager_guard():
+            self.setUp()
+            self.func_test_value_numpy()
+        self.setUp()
+        self.func_test_value_numpy()
+
+    def func_test_value_int(self):
         self._test(10)
 
+    def test_value_int(self):
+        with _test_eager_guard():
+            self.setUp()
+            self.func_test_value_int()
+        self.setUp()
+        self.func_test_value_int()
+
 
 class TestVarBaseSetitemInt64(TestVarBaseSetitem):
     def set_dtype(self):
@@ -1361,4 +1387,5 @@ def test_copy_gradient_from(self):
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index a3bfe3864a249..beaf361379b94 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -333,7 +333,8 @@ def _test_slice_index_list_bool(self, place):
         with self.assertRaises(IndexError):
             res = x[[True, False, False]]
         with self.assertRaises(ValueError):
-            res = x[[False, False]]
+            with paddle.static.program_guard(prog):
+                res = x[[False, False]]
 
     def test_slice(self):
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py
index 7fb4d39cd7338..36819e089edbf 100644
--- a/python/paddle/fluid/tests/unittests/test_where_op.py
+++ b/python/paddle/fluid/tests/unittests/test_where_op.py
@@ -29,15 +29,16 @@
 class TestWhereOp(OpTest):
     def setUp(self):
         self.op_type = 'where'
+        self.python_api = paddle.where
         self.init_config()
         self.inputs = {'Condition': self.cond, 'X': self.x, 'Y': self.y}
         self.outputs = {'Out': np.where(self.cond, self.x, self.y)}
 
     def test_check_output(self):
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out', check_eager=True)
+        self.check_grad(['X', 'Y'], 'Out', check_eager=False)
 
     def init_config(self):
         self.x = np.random.uniform((-3), 5, 100).astype('float64')
@@ -391,5 +392,6 @@ def test_eager(self):
             self.test_value_error()
 
 
-if (__name__ == '__main__'):
+if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
index f210d97362cf0..05a4dfe3c06b6 100644
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -109,7 +109,7 @@ def setUp(self):
         self.outputs = {'Boxes': boxes, 'Scores': scores}
 
     def test_check_output(self):
-        self.check_output(check_eager=True)
+        self.check_output(check_eager=False)
 
     def initTestCase(self):
         self.anchors = [10, 13, 16, 30, 33, 23]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 69bca8dd9ef15..66f2e871dac46 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -849,6 +849,38 @@ def ref_softsign(x):
     return out
 
 
+class XPUTestSoftshrinkOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'softshrink'
+        self.use_dynamic_create_class = False
+
+    class XPUTestSoftshrink(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "softshrink"
+            self.dtype = self.in_type
+
+            threshold = 0.5
+            np.random.seed(1023)
+            x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype)
+            out = ref_softshrink(x, threshold)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('softshrink')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSoftshrinkOP, stype)
+
+
+def ref_softshrink(x, threshold=0.5):
+    out = np.copy(x)
+    out = (out < -threshold) * (out + threshold) + (out > threshold) * (
+        out - threshold)
+    return out
+
+
 class XPUTestSwishOP(XPUOpTestWrapper):
     def __init__(self):
         self.op_name = 'swish'
@@ -879,5 +911,36 @@ def ref_swish(x):
     return out
 
 
+class XPUTestThresholdedReluOP(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'thresholded_relu'
+        self.use_dynamic_create_class = False
+
+    class XPUTestThresholdedRelu(TestActivationOPBase):
+        def set_case(self):
+            self.op_type = "thresholded_relu"
+            self.dtype = self.in_type
+
+            threshold = 1.0
+            np.random.seed(1024)
+            x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype)
+            x[np.abs(x) < 0.005] = 0.02
+            out = ref_thresholded_relu(x, threshold)
+
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
+            self.attrs = {'use_xpu': True}
+
+
+support_types = get_xpu_op_support_types('thresholded_relu')
+for stype in support_types:
+    create_test_class(globals(), XPUTestThresholdedReluOP, stype)
+
+
+def ref_thresholded_relu(x, threshold=1.0):
+    out = (x > threshold) * x
+    return out
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
index 78089d703891e..5f954659c2d9a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
@@ -23,6 +23,7 @@
 from op_test_xpu import XPUOpTest
 import paddle
 from paddle.fluid import Program, program_guard
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 
 def conv2d_forward_naive(input,
@@ -159,320 +160,334 @@ def init_paddings(self):
     globals()[cls_name] = TestPaddingVALIDCase
 
 
-class TestConv2DOp(XPUOpTest):
-    def setUp(self):
-        self.op_type = "conv2d"
-        self.use_cudnn = False
-        self.exhaustive_search = False
-        self.use_cuda = False
-        self.use_mkldnn = False
-        self.fuse_relu_before_depthwise_conv = False
-        self.data_format = "AnyLayout"
-        self.dtype = np.float32
-        self.init_kernel_type()
-        self.init_group()
-        self.init_dilation()
-        self.init_test_case()
-
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        if not self.has_cuda():
+class XPUTestConv2DOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'conv2d'
+        self.use_dynamic_create_class = False
+
+    class TestConv2DOp(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "conv2d"
+            self.use_cudnn = False
+            self.exhaustive_search = False
+            self.use_cuda = False
+            self.use_mkldnn = False
             self.fuse_relu_before_depthwise_conv = False
-        if self.fuse_relu_before_depthwise_conv:
-            input = input - 0.5
-            input -= (input < 0) * 0.1
-            input += (input >= 0) * 0.1
-            input2 = np.maximum(input, 0.0)
-        else:
-            input2 = input
-        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-
-        output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
-                                                  conv2d_param)
-        output = output.astype(self.dtype)
-
-        self.inputs = {
-            'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
-            'fuse_relu_before_depthwise_conv':
-            self.fuse_relu_before_depthwise_conv,
-            'exhaustive_search': self.exhaustive_search
-        }
-        self.outputs = {'Output': output}
-
-    def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
-
-    def test_check_output(self):
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output')
-
-    def test_check_grad_no_filter(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'], 'Output', no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        if self.dtype == np.float16 or (hasattr(self, "no_need_check_grad") and
-                                        self.no_need_check_grad == True):
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'], 'Output', no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_test_case_2(self):
-        pass
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_kernel_type(self):
-        pass
-
-
-class TestWithPad(TestConv2DOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-
-class TestWithStride(TestConv2DOp):
-    def init_test_case(self):
-        self.pad = [1, 1]
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-
-class TestWith1x1(TestConv2DOp):
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [120, f_c, 1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-
-# Please Don't remove the following code.
-# Currently, CI use cudnn V5.0 which not support dilation conv.
-# class TestCUDNNWithDilation(TestWithDilation):
-#     def init_op_type(self):
-#         self.op_type = "conv_cudnn"
+            self.data_format = "AnyLayout"
+            self.init_kernel_type()
+            self.init_group()
+            self.init_dilation()
+            self.init_test_case()
+
+            conv2d_param = {
+                'stride': self.stride,
+                'pad': self.pad,
+                'dilation': self.dilations
+            }
+
+            np.random.seed(100)
+            input = np.random.random(self.input_size).astype(self.dtype)
+            if not self.has_cuda():
+                self.fuse_relu_before_depthwise_conv = False
+            if self.fuse_relu_before_depthwise_conv:
+                input = input - 0.5
+                input -= (input < 0) * 0.1
+                input += (input >= 0) * 0.1
+                input2 = np.maximum(input, 0.0)
+            else:
+                input2 = input
+            np.random.seed(1)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(self.dtype)
+
+            output, _, _, _, _ = conv2d_forward_naive(input2, filter,
+                                                      self.groups, conv2d_param)
+            output = output.astype(self.dtype)
+
+            self.inputs = {
+                'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
+            }
+            self.attrs = {
+                'strides': self.stride,
+                'paddings': self.pad,
+                'groups': self.groups,
+                'dilations': self.dilations,
+                'use_cudnn': self.use_cudnn,
+                'use_mkldnn': self.use_mkldnn,
+                'data_format': self.data_format,
+                'fuse_relu_before_depthwise_conv':
+                self.fuse_relu_before_depthwise_conv,
+                'exhaustive_search': self.exhaustive_search
+            }
+            self.outputs = {'Output': output}
+
+        def has_cuda(self):
+            return core.is_compiled_with_cuda() and (self.use_cudnn or
+                                                     self.use_cuda)
+
+        def test_check_output(self):
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                           'Output')
+
+        def test_check_grad_no_filter(self):
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(
+                    self.place, ['Input'],
+                    'Output',
+                    no_grad_set=set(['Filter']))
+
+        def test_check_grad_no_input(self):
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(
+                    self.place, ['Filter'],
+                    'Output',
+                    no_grad_set=set(['Input']))
+
+        def init_test_case(self):
+            self.pad = [0, 0]
+            self.stride = [1, 1]
+            self.input_size = [2, 3, 5, 5]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 3, 3]
 
-# ---- test asymmetric padding ----
+        def init_test_case_2(self):
+            pass
+
+        def init_dilation(self):
+            self.dilations = [1, 1]
+
+        def init_group(self):
+            self.groups = 1
+
+        def init_kernel_type(self):
+            pass
+
+    class TestWithPad(TestConv2DOp):
+        def init_test_case(self):
+            self.pad = [1, 1]
+            self.stride = [1, 1]
+            self.input_size = [2, 3, 5, 5]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 3, 3]
+
+    class TestWithStride(TestConv2DOp):
+        def init_test_case(self):
+            self.pad = [1, 1]
+            self.stride = [2, 2]
+            self.input_size = [2, 3, 6, 6]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 3, 3]
+
+    class TestWith1x1(TestConv2DOp):
+        def init_test_case(self):
+            self.pad = [0, 0]
+            self.stride = [1, 1]
+            self.input_size = [2, 3, 5, 5]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [120, f_c, 1, 1]
 
+        def init_group(self):
+            self.groups = 1
 
-class TestConv2DOp_v2(XPUOpTest):
-    def setUp(self):
-        self.op_type = "conv2d"
-        self.use_cudnn = False
-        self.exhaustive_search = False
-        self.use_cuda = False
-        self.use_mkldnn = False
-        self.fuse_relu_before_depthwise_conv = False
-        self.dtype = np.float32
-        self.init_kernel_type()
-        self.init_group()
-        self.init_dilation()
-        self.init_data_format()
-        self.init_test_case()
-        self.init_paddings()
-        self.init_test_case_2()
-
-        conv2d_param = {
-            'stride': self.stride,
-            'pad': self.pad,
-            'dilation': self.dilations
-        }
-
-        input = np.random.random(self.input_size).astype(self.dtype)
-        if not self.has_cuda():
+
+# ---- test asymmetric padding ----
+class XPUTestConv2DOp_v2(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'conv2d'
+        self.use_dynamic_create_class = False
+
+    class TestConv2DOp_v2(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "conv2d"
+            self.use_cudnn = False
+            self.exhaustive_search = False
+            self.use_cuda = False
+            self.use_mkldnn = False
             self.fuse_relu_before_depthwise_conv = False
-        if self.fuse_relu_before_depthwise_conv:
-            input = input - 0.5
-            input -= (input < 0) * 0.1
-            input += (input >= 0) * 0.1
-            input2 = np.maximum(input, 0.0)
-        else:
-            input2 = input
-        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(
-            input2, filter, self.groups, conv2d_param, self.padding_algorithm,
-            self.data_format)
-        output = output.astype(self.dtype)
-
-        self.inputs = {
-            'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
-        }
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'padding_algorithm': self.padding_algorithm,
-            'groups': self.groups,
-            'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn,
-            'data_format': self.data_format,
-            'fuse_relu_before_depthwise_conv':
-            self.fuse_relu_before_depthwise_conv,
-            'exhaustive_search': self.exhaustive_search
-        }
-        self.outputs = {'Output': output}
-
-    def has_cuda(self):
-        return core.is_compiled_with_cuda() and (self.use_cudnn or
-                                                 self.use_cuda)
-
-    def test_check_output(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if self.dtype == np.float16:
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, {'Input', 'Filter'}, 'Output')
-
-    def test_check_grad_no_filter(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if self.dtype == np.float16:
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'], 'Output', no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        if self.dtype == np.float16:
-            return
-        if core.is_compiled_with_xpu():
-            paddle.enable_static()
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'], 'Output', no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 2]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 4, 3]
-
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_kernel_type(self):
-        pass
-
-    def init_paddings(self):
-        self.pad = [0, 0]
-        self.padding_algorithm = "EXPLICIT"
-
-    def init_data_format(self):
-        self.data_format = "NCHW"
-
-    def init_test_case_2(self):
-        pass
-
-
-class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
-    def init_paddings(self):
-        self.pad = [0, 0, 0, 0]
-        self.padding_algorithm = "EXPLICIT"
-
-
-class TestWithPad_AsyPadding(TestConv2DOp_v2):
-    def init_test_case(self):
-        self.stride = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_paddings(self):
-        self.pad = [1, 1, 1, 1]
-        self.padding_algorithm = "EXPLICIT"
-
-
-class TestWithStride_AsyPadding(TestConv2DOp_v2):
-    def init_test_case(self):
-        self.stride = [2, 2]
-        self.input_size = [2, 3, 6, 6]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_paddings(self):
-        self.pad = [1, 1, 1, 1]
-        self.padding_algorithm = "EXPLICIT"
+            self.init_kernel_type()
+            self.init_group()
+            self.init_dilation()
+            self.init_data_format()
+            self.init_test_case()
+            self.init_paddings()
+            self.init_test_case_2()
+
+            conv2d_param = {
+                'stride': self.stride,
+                'pad': self.pad,
+                'dilation': self.dilations
+            }
+
+            np.random.seed(100)
+            input = np.random.random(self.input_size).astype(self.dtype)
+            if not self.has_cuda():
+                self.fuse_relu_before_depthwise_conv = False
+            if self.fuse_relu_before_depthwise_conv:
+                input = input - 0.5
+                input -= (input < 0) * 0.1
+                input += (input >= 0) * 0.1
+                input2 = np.maximum(input, 0.0)
+            else:
+                input2 = input
+            np.random.seed(8)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(self.dtype)
+            output, _, _, _, _ = conv2d_forward_naive(
+                input2, filter, self.groups, conv2d_param,
+                self.padding_algorithm, self.data_format)
+            output = output.astype(self.dtype)
+
+            self.inputs = {
+                'Input': XPUOpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter)
+            }
+            self.attrs = {
+                'strides': self.stride,
+                'paddings': self.pad,
+                'padding_algorithm': self.padding_algorithm,
+                'groups': self.groups,
+                'dilations': self.dilations,
+                'use_cudnn': self.use_cudnn,
+                'use_mkldnn': self.use_mkldnn,
+                'data_format': self.data_format,
+                'fuse_relu_before_depthwise_conv':
+                self.fuse_relu_before_depthwise_conv,
+                'exhaustive_search': self.exhaustive_search
+            }
+            self.outputs = {'Output': output}
+
+        def has_cuda(self):
+            return core.is_compiled_with_cuda() and (self.use_cudnn or
+                                                     self.use_cuda)
+
+        def test_check_output(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_output_with_place(place=self.place)
+
+        def test_check_grad(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(self.place, {'Input', 'Filter'},
+                                           'Output')
+
+        def test_check_grad_no_filter(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(
+                    self.place, ['Input'],
+                    'Output',
+                    no_grad_set=set(['Filter']))
+
+        def test_check_grad_no_input(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            if (hasattr(self, "no_need_check_grad") and
+                    self.no_need_check_grad == True):
+                return
+            if core.is_compiled_with_xpu():
+                paddle.enable_static()
+                self.check_grad_with_place(
+                    self.place, ['Filter'],
+                    'Output',
+                    no_grad_set=set(['Input']))
+
+        def init_test_case(self):
+            self.pad = [0, 0]
+            self.stride = [1, 2]
+            self.input_size = [2, 3, 5, 5]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 4, 3]
+
+        def init_dilation(self):
+            self.dilations = [1, 1]
+
+        def init_group(self):
+            self.groups = 1
+
+        def init_kernel_type(self):
+            pass
+
+        def init_paddings(self):
+            self.pad = [0, 0]
+            self.padding_algorithm = "EXPLICIT"
+
+        def init_data_format(self):
+            self.data_format = "NCHW"
+
+        def init_test_case_2(self):
+            pass
+
+    class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
+        def init_paddings(self):
+            self.pad = [0, 0, 0, 0]
+            self.padding_algorithm = "EXPLICIT"
+
+    class TestWithPad_AsyPadding(TestConv2DOp_v2):
+        def init_test_case(self):
+            self.stride = [1, 1]
+            self.input_size = [2, 3, 5, 5]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 3, 3]
+
+        def init_paddings(self):
+            self.pad = [1, 1, 1, 1]
+            self.padding_algorithm = "EXPLICIT"
+
+    class TestWithStride_AsyPadding(TestConv2DOp_v2):
+        def init_test_case(self):
+            self.stride = [2, 2]
+            self.input_size = [2, 3, 6, 6]  # NCHW
+            assert np.mod(self.input_size[1], self.groups) == 0
+            f_c = self.input_size[1] // self.groups
+            self.filter_size = [6, f_c, 3, 3]
+
+        def init_paddings(self):
+            self.pad = [1, 1, 1, 1]
+            self.padding_algorithm = "EXPLICIT"
+
 
+support_types = get_xpu_op_support_types('conv2d')
+for stype in support_types:
+    create_test_class(globals(), XPUTestConv2DOp, stype)
+    create_test_class(globals(), XPUTestConv2DOp_v2, stype)
 
 #---------- test SAME VALID -----------
 #create_test_padding_SAME_class(TestConv2DOp_AsyPadding)
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
index 2ad79dd0cca00..9999217041859 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
@@ -21,6 +21,8 @@
 import sys
 sys.path.append("../")
 from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types
+from xpu.get_test_cover_info import XPUOpTestWrapper
 
 paddle.enable_static()
 np.set_printoptions(threshold=np.inf)
@@ -73,188 +75,198 @@ def seqconv(x,
     return np.dot(col, filter)
 
 
-class TestSeqProject(XPUOpTest):
-    def setUp(self):
-        self.init_test_case()
-        self.op_type = 'sequence_conv'
-        self.use_xpu = True
-
-        if self.context_length == 1 \
-                and self.context_start == 0 \
-                and self.padding_trainable:
-            print("If context_start is 0 " \
-                  "and context_length is 1," \
-                  " padding_trainable should be false.")
-            return
-
-        # one level, batch size
-        x = np.random.uniform(-6.10907e-05, 0.000104218,
-                              [self.input_size[0],
-                               self.input_size[1]]).astype('float32')
-        w = np.random.uniform(-3.17068e-05, 0.000159822, [
-            self.context_length * self.input_size[1], self.output_represention
-        ]).astype('float32')
-
-        begin_pad = np.max([0, -self.context_start])
-        end_pad = np.max([0, self.context_start + self.context_length - 1])
-        total_pad = begin_pad + end_pad
-        padding_data = np.random.uniform(
-            0, 0, [total_pad, self.input_size[1]]).astype('float32')
-        self.pad_data = padding_data
-        self.inputs = {
-            'X': (x, self.lod),
-            'Filter': w,
-        }
-        self.inputs_val = ['X', 'Filter']
-        self.inputs_val_no_x = ['Filter']
-        self.inputs_val_no_f = ['X']
-
-        if total_pad != 0:
-            self.inputs['PaddingData'] = padding_data
-            self.inputs_val = ['X', 'PaddingData', 'Filter']
-            self.inputs_val_no_x = ['PaddingData', 'Filter']
-            self.inputs_val_no_f = ['PaddingData', 'X']
-
-        self.attrs = {
-            'contextStart': self.context_start,
-            'contextLength': self.context_length,
-            'paddingTrainable': self.padding_trainable,
-            'contextStride': self.context_stride
-        }
-        out = seqconv(x, self.lod, w, self.context_length, self.context_start,
-                      self.padding_trainable, self.pad_data)
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
-        self.check_output_with_place(place)
-
-    def test_check_grad_input(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x))
-
-    def test_check_grad_padding_data(self):
-        if self.padding_trainable:
+class XPUTestSequenceConv(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'sequence_conv'
+
+    class TestSeqProject(XPUOpTest):
+        def setUp(self):
+            self.init_test_case()
+            self.op_type = 'sequence_conv'
+            self.dtype = self.in_type
+            self.use_xpu = True
+
+            if self.context_length == 1 \
+                    and self.context_start == 0 \
+                    and self.padding_trainable:
+                print("If context_start is 0 " \
+                      "and context_length is 1," \
+                      " padding_trainable should be false.")
+                return
+
+            # one level, batch size
+            x = np.random.uniform(-6.10907e-05, 0.000104218,
+                                  [self.input_size[0],
+                                   self.input_size[1]]).astype(self.dtype)
+            w = np.random.uniform(-3.17068e-05, 0.000159822, [
+                self.context_length * self.input_size[1],
+                self.output_represention
+            ]).astype(self.dtype)
+
+            begin_pad = np.max([0, -self.context_start])
+            end_pad = np.max([0, self.context_start + self.context_length - 1])
+            total_pad = begin_pad + end_pad
+            padding_data = np.random.uniform(
+                0, 0, [total_pad, self.input_size[1]]).astype(self.dtype)
+            self.pad_data = padding_data
+            self.inputs = {
+                'X': (x, self.lod),
+                'Filter': w,
+            }
+            self.inputs_val = ['X', 'Filter']
+            self.inputs_val_no_x = ['Filter']
+            self.inputs_val_no_f = ['X']
+
+            if total_pad != 0:
+                self.inputs['PaddingData'] = padding_data
+                self.inputs_val = ['X', 'PaddingData', 'Filter']
+                self.inputs_val_no_x = ['PaddingData', 'Filter']
+                self.inputs_val_no_f = ['PaddingData', 'X']
+
+            self.attrs = {
+                'contextStart': self.context_start,
+                'contextLength': self.context_length,
+                'paddingTrainable': self.padding_trainable,
+                'contextStride': self.context_stride
+            }
+            out = seqconv(x, self.lod, w, self.context_length,
+                          self.context_start, self.padding_trainable,
+                          self.pad_data)
+            self.outputs = {'Out': out}
+
+        def test_check_output(self):
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+        def test_check_grad_input(self):
+            self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x))
+
+        def test_check_grad_padding_data(self):
+            if self.padding_trainable:
+                self.check_grad(
+                    ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
+
+        def test_check_grad_Filter(self):
             self.check_grad(
-                ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
-
-    def test_check_grad_Filter(self):
-        self.check_grad(
-            ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
-
-    def test_check_grad_input_filter(self):
-        if self.padding_trainable:
-            self.check_grad(
-                ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
-
-    def test_check_grad_padding_input(self):
-        if self.padding_trainable:
-            self.check_grad(
-                self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
-
-    def test_check_grad_padding_filter(self):
-        if self.padding_trainable:
-            self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
-
-    def init_test_case(self):
-        self.input_row = 7
-        self.input_col = 25
-        self.context_start = -2
-        self.context_length = 5
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, self.input_col]
-        offset_lod = [[0, 1, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase1(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 11
-        self.context_start = -2
-        self.context_length = 5
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 50]
-        offset_lod = [[0, 4, 5, 8, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase2Len0(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 11
-        self.context_start = -2
-        self.context_length = 5
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 50]
-        offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase3(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 25
-        self.context_start = -2
-        self.context_length = 5
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, 25]
-        idx = list(range(self.input_size[0]))
-        del idx[0]
-        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
-                      [self.input_size[0]]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
-
-
-class TestSeqProjectCase4(TestSeqProject):
-    def init_test_case(self):
-        self.input_row = 7835
-        self.input_col = 128
-        self.context_start = -2
-        self.context_length = 5
-        self.padding_trainable = False
-        self.context_stride = 1
-
-        self.input_size = [self.input_row, self.input_col]
-        offset_lod = [[
-            0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515,
-            516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202,
-            1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914,
-            2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606,
-            2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097,
-            3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010,
-            4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604,
-            4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260,
-            5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939,
-            6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867,
-            6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699,
-            7827, 7835
-        ]]
-        self.lod = [[]]
-        # convert from offset-based lod to length-based lod
-        for i in range(len(offset_lod[0]) - 1):
-            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
-        self.output_represention = 8  # output feature size
+                ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
+
+        def test_check_grad_input_filter(self):
+            if self.padding_trainable:
+                self.check_grad(
+                    ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
+
+        def test_check_grad_padding_input(self):
+            if self.padding_trainable:
+                self.check_grad(
+                    self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
+
+        def test_check_grad_padding_filter(self):
+            if self.padding_trainable:
+                self.check_grad(
+                    self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
+
+        def init_test_case(self):
+            self.input_row = 7
+            self.input_col = 25
+            self.context_start = -2
+            self.context_length = 5
+            self.padding_trainable = False
+            self.context_stride = 1
+
+            self.input_size = [self.input_row, self.input_col]
+            offset_lod = [[0, 1, self.input_row]]
+            self.lod = [[]]
+            # convert from offset-based lod to length-based lod
+            for i in range(len(offset_lod[0]) - 1):
+                self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+            self.output_represention = 8  # output feature size
+
+    class TestSeqProjectCase1(TestSeqProject):
+        def init_test_case(self):
+            self.input_row = 11
+            self.context_start = -2
+            self.context_length = 5
+            self.padding_trainable = False
+            self.context_stride = 1
+
+            self.input_size = [self.input_row, 50]
+            offset_lod = [[0, 4, 5, 8, self.input_row]]
+            self.lod = [[]]
+            # convert from offset-based lod to length-based lod
+            for i in range(len(offset_lod[0]) - 1):
+                self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+            self.output_represention = 8  # output feature size
+
+    class TestSeqProjectCase2Len0(TestSeqProject):
+        def init_test_case(self):
+            self.input_row = 11
+            self.context_start = -2
+            self.context_length = 5
+            self.padding_trainable = False
+            self.context_stride = 1
+
+            self.input_size = [self.input_row, 50]
+            offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
+            self.lod = [[]]
+            # convert from offset-based lod to length-based lod
+            for i in range(len(offset_lod[0]) - 1):
+                self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+            self.output_represention = 8  # output feature size
+
+    class TestSeqProjectCase3(TestSeqProject):
+        def init_test_case(self):
+            self.input_row = 25
+            self.context_start = -2
+            self.context_length = 5
+            self.padding_trainable = False
+            self.context_stride = 1
+
+            self.input_size = [self.input_row, 25]
+            idx = list(range(self.input_size[0]))
+            del idx[0]
+            offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                          [self.input_size[0]]]
+            self.lod = [[]]
+            # convert from offset-based lod to length-based lod
+            for i in range(len(offset_lod[0]) - 1):
+                self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+            self.output_represention = 8  # output feature size
+
+    class TestSeqProjectCase4(TestSeqProject):
+        def init_test_case(self):
+            self.input_row = 7835
+            self.input_col = 128
+            self.context_start = -2
+            self.context_length = 5
+            self.padding_trainable = False
+            self.context_stride = 1
+
+            self.input_size = [self.input_row, self.input_col]
+            offset_lod = [[
+                0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387,
+                515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073,
+                1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876,
+                1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475,
+                2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838,
+                2966, 2967, 2968, 2969, 3097, 3225, 3353, 3481, 3482, 3520,
+                3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219,
+                4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694,
+                4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440,
+                5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021,
+                6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867,
+                6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595,
+                7699, 7827, 7835
+            ]]
+            self.lod = [[]]
+            # convert from offset-based lod to length-based lod
+            for i in range(len(offset_lod[0]) - 1):
+                self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+            self.output_represention = 8  # output feature size
+
+
+support_types = get_xpu_op_support_types('sequence_conv')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSequenceConv, stype)
 
 
 class TestSeqConvApi(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
index 8f3578b526e1e..3d7c9959db9ea 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -18,169 +18,174 @@
 import unittest
 sys.path.append("..")
 from op_test import OpTest
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 
 
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            "use_xpu": True
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [3, 3, 4]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1:3, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
-        self.check_output_with_place(place)
-
-    def test_check_grad_normal(self):
-        place = paddle.XPUPlace(0)
-        self.check_grad_with_place(place, ['Input'], 'Out')
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestCase1(TestSliceOp):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-3, 0, 2]
-        self.ends = [3, 100, -1]
-        self.axes = [0, 1, 2]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-3:3, 0:100, 2:-1, :]
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestCase2(TestSliceOp):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-3, 0, 2]
-        self.ends = [3, 100, -1]
-        self.axes = [0, 1, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-3:3, 0:100, :, 2:-1]
+class XPUTestSliceOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'slice'
+        self.use_dynamic_create_class = False
+
+    class TestSliceOp(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "slice"
+            self.config()
+            self.inputs = {'Input': self.input}
+            self.outputs = {'Out': self.out}
+            self.attrs = {
+                'axes': self.axes,
+                'starts': self.starts,
+                'ends': self.ends,
+                'infer_flags': self.infer_flags,
+                "use_xpu": True
+            }
+
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [1, 0, 2]
+            self.ends = [3, 3, 4]
+            self.axes = [0, 1, 2]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[1:3, 0:3, 2:4, :]
+
+        def test_check_grad_normal(self):
+            if self.dtype == np.float16:
+                self.check_grad_with_place(self.place, ['Input'], 'Out')
+            else:
+                user_defined_grad_outputs = np.random.random(
+                    self.out.shape).astype(self.dtype)
+                self.check_grad_with_place(
+                    self.place, ['Input'],
+                    'Out',
+                    user_defined_grad_outputs=user_defined_grad_outputs)
+
+    class TestCase1(TestSliceOp):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [-3, 0, 2]
+            self.ends = [3, 100, -1]
+            self.axes = [0, 1, 2]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+    class TestCase2(TestSliceOp):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [-3, 0, 2]
+            self.ends = [3, 100, -1]
+            self.axes = [0, 1, 3]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[-3:3, 0:100, :, 2:-1]
 
 
 # 1.2 with attr(decrease)
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim(OpTest):
-    def setUp(self):
-        self.op_type = "slice"
-        self.config()
-        self.inputs = {'Input': self.input}
-        self.outputs = {'Out': self.out}
-        self.attrs = {
-            'axes': self.axes,
-            'starts': self.starts,
-            'ends': self.ends,
-            'infer_flags': self.infer_flags,
-            'decrease_axis': self.decrease_axis,
-            "use_xpu": True
-        }
-
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 3, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1, 0:3, 2:4, :]
-
-    def test_check_output(self):
-        place = paddle.XPUPlace(0)
-        self.check_output_with_place(place)
-
-    def test_check_grad_normal(self):
-        place = paddle.XPUPlace(0)
-        self.check_grad_with_place(place, ['Input'], 'Out')
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [1, 0, 2]
-        self.ends = [2, 1, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0, 1]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[1, 0, 2:4, :]
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-1, 0, 2]
-        self.ends = [1000000, 1, 4]
-        self.axes = [0, 1, 2]
-        self.decrease_axis = [0, 1]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[-1, 0, 2:4, :]
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
-        self.starts = [0, 1, 2, 3]
-        self.ends = [1, 2, 3, 4]
-        self.axes = [0, 1, 2, 3]
-        self.decrease_axis = [0, 1, 2, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[0, 1, 2, 3:4]
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [-1]
-        self.ends = [1000000]
-        self.axes = [3]
-        self.decrease_axis = [3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[:, :, :, -1]
-
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
-    def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
-        self.starts = [0, 1, 2, 3]
-        self.ends = [1, 2, 3, 4]
-        self.axes = [0, 1, 2, 3]
-        self.decrease_axis = [0, 1, 2, 3]
-        self.infer_flags = [1, 1, 1]
-        self.out = self.input[0, 1, 2, 3:4]
-
+class XPUTestSliceOp_decs_dim(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'slice'
+        self.use_dynamic_create_class = False
+
+    class TestSliceOp_decs_dim(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "slice"
+            self.config()
+            self.inputs = {'Input': self.input}
+            self.outputs = {'Out': self.out}
+            self.attrs = {
+                'axes': self.axes,
+                'starts': self.starts,
+                'ends': self.ends,
+                'infer_flags': self.infer_flags,
+                'decrease_axis': self.decrease_axis,
+                "use_xpu": True
+            }
+
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [1, 0, 2]
+            self.ends = [2, 3, 4]
+            self.axes = [0, 1, 2]
+            self.decrease_axis = [0]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[1, 0:3, 2:4, :]
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad_normal(self):
+            if self.dtype == np.float16:
+                self.check_grad_with_place(self.place, ['Input'], 'Out')
+            else:
+                user_defined_grad_outputs = np.random.random(
+                    self.out.shape).astype(self.dtype)
+                self.check_grad_with_place(
+                    self.place, ['Input'],
+                    'Out',
+                    user_defined_grad_outputs=user_defined_grad_outputs)
+
+    class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [1, 0, 2]
+            self.ends = [2, 1, 4]
+            self.axes = [0, 1, 2]
+            self.decrease_axis = [0, 1]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[1, 0, 2:4, :]
+
+    class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [-1, 0, 2]
+            self.ends = [1000000, 1, 4]
+            self.axes = [0, 1, 2]
+            self.decrease_axis = [0, 1]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[-1, 0, 2:4, :]
+
+    class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype)
+            self.starts = [0, 1, 2, 3]
+            self.ends = [1, 2, 3, 4]
+            self.axes = [0, 1, 2, 3]
+            self.decrease_axis = [0, 1, 2, 3]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[0, 1, 2, 3:4]
+
+    class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [-1]
+            self.ends = [1000000]
+            self.axes = [3]
+            self.decrease_axis = [3]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[:, :, :, -1]
+
+    class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+        def config(self):
+            self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
+            self.starts = [0, 1, 2, 3]
+            self.ends = [1, 2, 3, 4]
+            self.axes = [0, 1, 2, 3]
+            self.decrease_axis = [0, 1, 2, 3]
+            self.infer_flags = [1, 1, 1]
+            self.out = self.input[0, 1, 2, 3:4]
+
+
+support_types = get_xpu_op_support_types('slice')
+for stype in support_types:
+    create_test_class(globals(), XPUTestSliceOp, stype)
+    create_test_class(globals(), XPUTestSliceOp_decs_dim, stype)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
index d010e1633578e..cd18bd63a88f7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
@@ -24,221 +24,158 @@
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
 from paddle.fluid import core
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
 
 paddle.enable_static()
 np.random.seed(10)
 
 
 #Situation 1: repeat_times is a list (without tensor)
-class TestTileOpRank1(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.init_data()
-
-        self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")}
-        self.attrs = {'repeat_times': self.repeat_times}
-        output = np.tile(self.inputs['X'], self.repeat_times)
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def init_data(self):
-        self.ori_shape = [100]
-        self.repeat_times = [2]
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
-
-    def test_check_grad(self):
-        pass
-
-
-#with dimension expanding
-class TestTileOpRank2Expanding(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = [120]
-        self.repeat_times = [2, 2]
-
-
-class TestTileOpRank2(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.repeat_times = [2, 3]
-
-
-class TestTileOpRank3_Corner(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 10, 5)
-        self.repeat_times = (1, 1, 1)
-
-
-class TestTileOpRank3_Corner2(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 10, 5)
-        self.repeat_times = (2, 2)
-
-
-class TestTileOpRank3(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 4, 15)
-        self.repeat_times = (2, 1, 4)
-
-
-class TestTileOpRank4(TestTileOpRank1):
-    def init_data(self):
-        self.ori_shape = (2, 4, 5, 7)
-        self.repeat_times = (3, 2, 1, 2)
+class XPUTestTileOpRank1(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'tile'
+        self.use_dynamic_create_class = False
+
+    class TestTileOpRank1(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.__class__.no_need_check_grad = True
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "tile"
+            self.init_data()
+            self.inputs = {
+                'X': np.random.random(self.ori_shape).astype(self.dtype)
+            }
+            self.attrs = {'repeat_times': self.repeat_times}
+            output = np.tile(self.inputs['X'], self.repeat_times)
+            self.outputs = {'Out': output}
+
+        def init_data(self):
+            self.ori_shape = [100]
+            self.repeat_times = [2]
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+    #with dimension expanding
+    class TestTileOpRank2Expanding(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = [120]
+            self.repeat_times = [2, 2]
+
+    class TestTileOpRank2(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = [12, 14]
+            self.repeat_times = [2, 3]
+
+    class TestTileOpRank3_Corner(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = (2, 10, 5)
+            self.repeat_times = (1, 1, 1)
+
+    class TestTileOpRank3_Corner2(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = (2, 10, 5)
+            self.repeat_times = (2, 2)
+
+    class TestTileOpRank3(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = (2, 4, 15)
+            self.repeat_times = (2, 1, 4)
+
+    class TestTileOpRank4(TestTileOpRank1):
+        def init_data(self):
+            self.ori_shape = (2, 4, 5, 7)
+            self.repeat_times = (3, 2, 1, 2)
 
 
 # Situation 2: repeat_times is a list (with tensor)
-class TestTileOpRank1_tensor_attr(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.init_data()
-        repeat_times_tensor = []
-        for index, ele in enumerate(self.repeat_times):
-            repeat_times_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {
-            'X': np.random.random(self.ori_shape).astype("float32"),
-            'repeat_times_tensor': repeat_times_tensor,
-        }
-        self.attrs = {"repeat_times": self.infer_repeat_times}
-        output = np.tile(self.inputs['X'], self.repeat_times)
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def init_data(self):
-        self.ori_shape = [100]
-        self.repeat_times = [2]
-        self.infer_repeat_times = [-1]
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
-
-    def test_check_grad(self):
-        pass
-
-
-class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.repeat_times = [1, 1]
-        self.infer_repeat_times = [1, -1]
-
-
-class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.repeat_times = [2, 3]
-        self.infer_repeat_times = [-1, 3]
+class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'tile'
+        self.use_dynamic_create_class = False
+
+    class TestTileOpRank1_tensor_attr(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.__class__.no_need_check_grad = True
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "tile"
+            self.init_data()
+            repeat_times_tensor = []
+            for index, ele in enumerate(self.repeat_times):
+                repeat_times_tensor.append(("x" + str(index), np.ones(
+                    (1)).astype('int32') * ele))
+
+            self.inputs = {
+                'X': np.random.random(self.ori_shape).astype(self.dtype),
+                'repeat_times_tensor': repeat_times_tensor,
+            }
+            self.attrs = {"repeat_times": self.infer_repeat_times}
+            output = np.tile(self.inputs['X'], self.repeat_times)
+            self.outputs = {'Out': output}
+
+        def init_data(self):
+            self.ori_shape = [100]
+            self.repeat_times = [2]
+            self.infer_repeat_times = [-1]
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+    class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+        def init_data(self):
+            self.ori_shape = [12, 14]
+            self.repeat_times = [1, 1]
+            self.infer_repeat_times = [1, -1]
+
+    class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+        def init_data(self):
+            self.ori_shape = [12, 14]
+            self.repeat_times = [2, 3]
+            self.infer_repeat_times = [-1, 3]
 
 
 # Situation 3: repeat_times is a tensor
-class TestTileOpRank1_tensor(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.init_data()
-
-        self.inputs = {
-            'X': np.random.random(self.ori_shape).astype("float32"),
-            'RepeatTimes': np.array(self.repeat_times).astype("int32"),
-        }
-        self.attrs = {}
-        output = np.tile(self.inputs['X'], self.repeat_times)
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def init_data(self):
-        self.ori_shape = [100]
-        self.repeat_times = [2]
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
-
-    def test_check_grad(self):
-        pass
-
-
-class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
-    def init_data(self):
-        self.ori_shape = [12, 14]
-        self.repeat_times = [2, 3]
-
-
-# Situation 4: input x is Integer
-class TestTileOpInteger(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.inputs = {
-            'X': np.random.randint(
-                10, size=(4, 4, 5)).astype("int32")
-        }
-        self.attrs = {'repeat_times': [2, 1, 4]}
-        output = np.tile(self.inputs['X'], (2, 1, 4))
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
-
-
-# Situation 5: input x is Integer
-class TestTileOpInt64_t(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("int64")
-        }
-        self.attrs = {'repeat_times': [2, 1, 4]}
-        output = np.tile(self.inputs['X'], (2, 1, 4))
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
-
-
-# Situation 6: input x is Bool
-class TestTileOpBool(XPUOpTest):
-    def setUp(self):
-        self.set_xpu()
-        self.place = paddle.XPUPlace(0)
-        self.op_type = "tile"
-        self.inputs = {
-            'X': np.random.randint(
-                10, size=(2, 4, 5)).astype("bool")
-        }
-        self.attrs = {'repeat_times': [2, 1, 4]}
-        output = np.tile(self.inputs['X'], (2, 1, 4))
-        self.outputs = {'Out': output}
-
-    def set_xpu(self):
-        self.__class__.use_xpu = True
-
-    def test_check_output(self):
-        self.check_output_with_place(self.place)
+class XPUTestTileOpRank1_tensor(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'tile'
+        self.use_dynamic_create_class = False
+
+    class TestTileOpRank1_tensor(XPUOpTest):
+        def setUp(self):
+            self.dtype = self.in_type
+            self.__class__.no_need_check_grad = True
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "tile"
+            self.init_data()
+
+            self.inputs = {
+                'X': np.random.random(self.ori_shape).astype(self.dtype),
+                'RepeatTimes': np.array(self.repeat_times).astype("int32"),
+            }
+            self.attrs = {}
+            output = np.tile(self.inputs['X'], self.repeat_times)
+            self.outputs = {'Out': output}
+
+        def init_data(self):
+            self.ori_shape = [100]
+            self.repeat_times = [2]
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+    class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+        def init_data(self):
+            self.ori_shape = [12, 14]
+            self.repeat_times = [2, 3]
+
+
+support_types = get_xpu_op_support_types('tile')
+for stype in support_types:
+    create_test_class(globals(), XPUTestTileOpRank1, stype)
+    create_test_class(globals(), XPUTestTileOpRank1_tensor_attr, stype)
+    create_test_class(globals(), XPUTestTileOpRank1_tensor, stype)
 
 
 # Test python API
diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py
index f3763cb447f39..1c7e4fb5f1ad0 100644
--- a/python/paddle/fluid/variable_index.py
+++ b/python/paddle/fluid/variable_index.py
@@ -382,7 +382,7 @@ def _getitem_impl_(var, item):
             idx = assign(np.array(slice_item).astype("int32"))
             return index_select(var, index=idx, axis=0)
 
-        elif isinstance(slice_item, (Variable)):
+        elif isinstance(slice_item, (Variable, core.eager.Tensor)):
             if len(item) == 1:
 
                 from ..tensor import index_select, gather_nd
@@ -636,7 +636,7 @@ def _setitem_impl_(var, item, value):
         shape = list(value.shape)
         if dtype == core.VarDesc.VarType.BOOL:
             value_name = "bool_values"
-            values = [bool(v) for v in value.flat]
+            values = [int(v) for v in value.flat]
         elif dtype == core.VarDesc.VarType.FP32:
             value_name = "fp32_values"
             values = [float(v) for v in value.flat]
@@ -657,7 +657,7 @@ def _setitem_impl_(var, item, value):
         attrs[value_name] = values
         attrs["shape"] = shape
 
-    elif isinstance(value, Variable):
+    elif isinstance(value, (Variable, core.eager.Tensor)):
         inputs["ValueTensor"] = value
     else:
         raise TypeError(
@@ -665,7 +665,9 @@ def _setitem_impl_(var, item, value):
             "paddle.Tensor to a paddle.Tensor, but received {}".format(
                 type(value)))
 
-    if paddle.fluid.framework.in_dygraph_mode():
+    if paddle.fluid.framework.in_dygraph_mode(
+    ) and not paddle.fluid.framework._in_eager_mode():
+        # TODO(pangyoki) add inplace(BumpInplaceVersion) if need
         var._bump_inplace_version()
 
     cur_block = default_main_program().current_block()
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index 15d5640b11fe5..59e285c1200b8 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -68,8 +68,9 @@ def to_list(value):
 
 
 def to_numpy(var):
-    assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable"
-    if isinstance(var, fluid.core.VarBase):
+    assert isinstance(var, (Variable, fluid.core.VarBase,
+                            fluid.core.eager.Tensor)), "not a variable"
+    if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
         return var.numpy()
     t = global_scope().find_var(var.name).get_tensor()
     return np.array(t)
diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py
index 45810621e4207..05f6a80a442f2 100644
--- a/python/paddle/incubate/operators/graph_send_recv.py
+++ b/python/paddle/incubate/operators/graph_send_recv.py
@@ -19,7 +19,12 @@
 from paddle import _C_ops
 
 
-def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
+def graph_send_recv(x,
+                    src_index,
+                    dst_index,
+                    pool_type="sum",
+                    out_size=None,
+                    name=None):
     r"""
 
     Graph Learning Send_Recv combine operator.
@@ -27,7 +32,7 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
     This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory 
     consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index`
     to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor 
-    in different pooling types, like sum, mean, max, or min.
+    in different pooling types, like sum, mean, max, or min. Besides, we can set `out_size` to get necessary output shape.
 
     .. code-block:: text
 
@@ -43,6 +48,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
 
            pool_type = "sum"
 
+           out_size = None
+
            Then:
 
            Out = [[0, 2, 3],
@@ -56,6 +63,9 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
                             The available data type is int32, int64. 
         pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`.
                          Default value is `sum`.
+        out_size (int64|None): We can set `out_size` to get necessary output shape. If not set, then this 
+                              attribute will not be used. If set, it should be equal with or larger than
+                              max(dst_index) + 1.
         name (str, optional): Name for the operation (optional, default is None).
                               For more information, please refer to :ref:`api_guide_Name`.
 
@@ -75,6 +85,21 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
             out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
             # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]
 
+            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+            indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+            src_index = indexes[:, 0]
+            dst_index = indexes[:, 1]
+            out_size = paddle.max(dst_index) + 1
+            out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size)
+            # Outputs: [[0., 2., 3.], [[2., 8., 10.]]]
+
+            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+            indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+            src_index = indexes[:, 0]
+            dst_index = indexes[:, 1]
+            out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
+            # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]]
+
     """
 
     if pool_type not in ["sum", "mean", "max", "min"]:
@@ -82,9 +107,16 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
             "pool_type should be `sum`, `mean`, `max` or `min`, but received %s"
             % pool_type)
 
+    # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1.
+
     if in_dygraph_mode():
-        out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type',
-                                          pool_type.upper())
+        if out_size is None or out_size <= 0:
+            out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index,
+                                              'pool_type', pool_type.upper())
+        else:
+            out, tmp = _C_ops.graph_send_recv(
+                x, src_index, dst_index, 'pool_type',
+                pool_type.upper(), 'out_size', out_size)
         return out
 
     check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"),
@@ -105,5 +137,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
                 "Dst_index": dst_index},
         outputs={"Out": out,
                  "Dst_count": dst_count},
-        attrs={"pool_type": pool_type.upper()})
+        attrs={
+            "pool_type": pool_type.upper(),
+            "out_size": 0 if out_size is None or out_size <= 0 else out_size
+        })
     return out
diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py
index 9f577d5ff3802..2d0b079ee9280 100644
--- a/python/paddle/incubate/tensor/math.py
+++ b/python/paddle/incubate/tensor/math.py
@@ -29,7 +29,7 @@ def segment_sum(data, segment_ids, name=None):
     where sum is over j such that `segment_ids[j] == i`.
 
     Args:
-        data (Tensor): A tensor, available data type float32, float64.
+        data (Tensor): A tensor, available data type float32, float64, int32, int64.
         segment_ids (Tensor): A 1-D tensor, which have the same size
                             with the first dimension of input data. 
                             Available data type is int32, int64.
@@ -54,7 +54,8 @@ def segment_sum(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -82,7 +83,7 @@ def segment_mean(data, segment_ids, name=None):
     of all index 'segment_ids[j] == i'.
 
     Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size 
                             with the first dimension of input data. 
                             available data type is int32, int64.
@@ -107,7 +108,8 @@ def segment_mean(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -134,7 +136,7 @@ def segment_min(data, segment_ids, name=None):
     where min is over j such that `segment_ids[j] == i`.
 
     Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size
                             with the first dimension of input data. 
                             available data type is int32, int64.
@@ -159,7 +161,8 @@ def segment_min(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
@@ -186,7 +189,7 @@ def segment_max(data, segment_ids, name=None):
     where max is over j such that `segment_ids[j] == i`.
 
     Args:
-        data (tensor): a tensor, available data type float32, float64.
+        data (tensor): a tensor, available data type float32, float64, int32, int64.
         segment_ids (tensor): a 1-d tensor, which have the same size
                             with the first dimension of input data. 
                             available data type is int32, int64.
@@ -211,7 +214,8 @@ def segment_max(data, segment_ids, name=None):
         out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX")
         return out
 
-    check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool")
+    check_variable_and_dtype(data, "X", ("float32", "float64", "int32",
+                                         "int64"), "segment_pool")
     check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"),
                              "segment_pool")
 
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
index d75c95b437201..ef62aa264fb26 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -282,7 +282,7 @@ def update(self, correct, *args):
         Return:
             Tensor: the accuracy of current step.
         """
-        if isinstance(correct, paddle.Tensor):
+        if isinstance(correct, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             correct = correct.numpy()
         num_samples = np.prod(np.array(correct.shape[:-1]))
         accs = []
@@ -410,12 +410,12 @@ def update(self, preds, labels):
                 the shape should keep the same as preds.
                 The data type is 'int32' or 'int64'.
         """
-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
 
-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
@@ -543,12 +543,12 @@ def update(self, preds, labels):
                 the shape should keep the same as preds.
                 Shape: [batch_size, 1], Dtype: 'int32' or 'int64'.
         """
-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
 
-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
@@ -698,12 +698,12 @@ def update(self, preds, labels):
                 (batch_size, 1), labels[i] is either o or 1,
                 representing the label of the instance i.
         """
-        if isinstance(labels, paddle.Tensor):
+        if isinstance(labels, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             labels = labels.numpy()
         elif not _is_numpy_(labels):
             raise ValueError("The 'labels' must be a numpy ndarray or Tensor.")
 
-        if isinstance(preds, paddle.Tensor):
+        if isinstance(preds, (paddle.Tensor, paddle.fluid.core.eager.Tensor)):
             preds = preds.numpy()
         elif not _is_numpy_(preds):
             raise ValueError("The 'preds' must be a numpy ndarray or Tensor.")
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 91449ef538ff3..11d2ad6fa8826 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -22,7 +22,7 @@
 
 import warnings
 from ...fluid.layer_helper import LayerHelper
-from ...fluid.framework import convert_np_dtype_to_dtype_
+from ...fluid.framework import convert_np_dtype_to_dtype_, _in_eager_mode
 from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 import paddle
 from paddle import _C_ops, in_dynamic_mode
@@ -576,6 +576,8 @@ def relu_(x, name=None):
     Inplace version of ``relu`` API, the output Tensor will be inplaced with input ``x``.
     Please refer to :ref:`api_nn_cn_relu`.
     """
+    if _in_eager_mode():
+        return _C_ops.final_state_relu_(x)
     return _C_ops.relu_(x)
 
 
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
index de8a7ff6d3c7b..4c30ed03735f2 100644
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
@@ -19,6 +19,7 @@
 from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle import _C_ops
 from paddle import in_dynamic_mode
+from paddle.framework import _in_eager_mode
 
 __all__ = []
 
@@ -87,6 +88,8 @@ def one_hot(x, num_classes, name=None):
     """
 
     if in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_one_hot(x, num_classes)
         return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range',
                                  False)
     else:
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index e6efde836284a..10d4073b80c59 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -36,7 +36,7 @@
 from paddle.utils import deprecated
 from paddle import _C_ops
 from paddle import in_dynamic_mode
-from paddle.framework import core
+from paddle.framework import core, _in_eager_mode
 
 __all__ = []
 
@@ -114,7 +114,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean',
             reduction)
 
     if in_dynamic_mode():
-        out = _C_ops.bce_loss(input, label)
+        if _in_eager_mode():
+            out = _C_ops.final_state_bce_loss(input, label)
+        else:
+            out = _C_ops.bce_loss(input, label)
         if weight is not None:
             out = _C_ops.elementwise_mul(out, weight, 'axis', -1)
 
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
index 5167c18de179d..6c575b4b997d6 100644
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -59,16 +59,14 @@ class SGD(Optimizer):
         .. code-block:: python
 
             import paddle
-            import numpy as np
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
             linear = paddle.nn.Linear(10, 10)
             inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
             sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
-            back = out.backward()
+            out.backward()
             sgd.step()
             sgd.clear_grad()
 
diff --git a/python/paddle/profiler/__init__.py b/python/paddle/profiler/__init__.py
index 4999e703f2a5a..ae190b8a7846c 100644
--- a/python/paddle/profiler/__init__.py
+++ b/python/paddle/profiler/__init__.py
@@ -20,7 +20,7 @@
 from .profiler_statistic import SortedKeys
 
 __all__ = [
-    'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler',
+    'ProfilerState', 'ProfilerTarget', 'make_scheduler',
     'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
     'load_profiler_result', 'SortedKeys'
 ]
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
index dc637bf983046..efbe88583b776 100644
--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,7 @@
                                TracerEventType)
 
 from .utils import RecordEvent, wrap_optimizers
-from .profiler_statistic import SortedKeys
+from .profiler_statistic import StatisticData, _build_table, SortedKeys
 
 
 class ProfilerState(Enum):
@@ -32,21 +32,28 @@ class ProfilerState(Enum):
     Profiler state that can be specified to control profiler action.
 
     CLOSED: The profilers are closed.
+
     READY:  The profilers are open, but the data will not be recorded.
-            This state is used for reducing overhead influence when profilers start.
+    This state is used for reducing overhead influence when profilers start.
+
     RECORD: The profilers are open, and the data will be recorded.
-    RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, 
-            the collected data will be returned.
+
+    RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period,
+    the collected data will be returned.
     """
     CLOSED = 0
     READY = 1
     RECORD = 2
-    RECORD_AND_RETURN = 3  # the last step of RECORD 
+    RECORD_AND_RETURN = 3  # the last step of RECORD
 
 
 class ProfilerTarget(Enum):
     r"""
     Target device for profiling.
+
+    CPU: Profile events on CPU.
+    
+    GPU: Profile events on GPU.
     """
     CPU = 0
     GPU = 1
@@ -62,17 +69,19 @@ def make_scheduler(*,
     Return a scheduler function, which scheduler the state according to the setting.
     The state transform confirms to:
 
-    (CLOSED)  (CLOSED)    (CLOSED)  (READY)    (RECORD,last RETURN)      (CLOSED)
-    START -> skip_first -> closed -> ready    ->    record       ->      END
-                            |                        |
-                            |                        | (if has_repeated < repeat)
-                            - - - - - - - - - - - -
-    Note that repeat <= 0 means the cycle will continue until the profiler exits.    
+    .. code-block:: text
+
+        (CLOSED)  (CLOSED)    (CLOSED)  (READY)    (RECORD,last RETURN)      (CLOSED)
+        START -> skip_first -> closed -> ready    ->    record       ->      END
+                                |                        |
+                                |                        | (if has_repeated < repeat)
+                                - - - - - - - - - - - -
+        Note that repeat <= 0 means the cycle will continue until the profiler exits.
 
     Parameters:
         closed(int): The number of steps in state ProfilerState.CLOSED.
-        ready(int):  The number of steps in state ProfilerState.READY. 
-        record(int): The number of steps in state ProfilerState.RECORD.    
+        ready(int):  The number of steps in state ProfilerState.READY.
+        record(int): The number of steps in state ProfilerState.RECORD.
         repeat(int): The number of cycles to repeat above state transform.
         skip_first(int): The number of first steps to drop, not participate in the state transform.
 
@@ -81,13 +90,23 @@ def make_scheduler(*,
 
     Examples:
         1. profiling range [2, 5]
+
         batch 0: closed, batch 1: ready, batch [2, 5] record
-        .. code-block:: python
-        make_scheduler(closed=1, ready=1, record=4, repeat=1)
+
+            .. code-block:: python
+
+                import paddle.profiler as profiler
+                profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
+
+
         2. profiling range [3,6], [9,12], [15,18]...
+
         batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
-        .. code-block:: python
-        make_scheduler(closed=1, ready=1, record=4, skip_first=1)
+
+            .. code-block:: python
+
+                import paddle.profiler as profiler
+                profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
     """
 
     def getScheduleState(step: int) -> ProfilerState:
@@ -138,15 +157,16 @@ def export_chrome_tracing(dir_name: str,
 
     Examples:
         .. code-block:: python
-        import paddle.profiler as profiler
-        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
-                                        profiler.ProfilerTarget.GPU],
-                            scheduler = (3, 10),
-                            on_trace_ready = profiler.export_chrome_tracing('./log')
-                            ) as p:
-            for iter in range(N):
-            train()
-            p.step()
+
+            # required: gpu
+            import paddle.profiler as profiler
+            with profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 10),
+                    on_trace_ready=profiler.export_protobuf('./log')) as p:
+                for iter in range(10):
+                    #train()
+                    p.step()
     """
     if not os.path.exists(dir_name):
         try:
@@ -181,15 +201,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
 
     Examples:
         .. code-block:: python
-        import paddle.profiler as profiler
-        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
-                                        profiler.ProfilerTarget.GPU],
-                            scheduler = (3, 10),
-                            on_trace_ready = profiler.export_protobuf('./log')
-                            ) as p:
-            for iter in range(N):
-            train()
-            p.step()
+
+            # required: gpu
+            import paddle.profiler as profiler
+            with profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 10),
+                    on_trace_ready = profiler.export_protobuf('./log')) as p:
+                for iter in range(10):
+                    #train()
+                    p.step()
     """
     if not os.path.exists(dir_name):
         try:
@@ -216,7 +237,7 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
     r"""
     Get the current supported profiler target in the system.
     """
-    if paddle.device.is_compiled_with_cuda():
+    if _Profiler.is_cupti_supported():
         return [ProfilerTarget.CPU, ProfilerTarget.GPU]
     return [ProfilerTarget.CPU]
 
@@ -226,48 +247,56 @@ class Profiler:
     Profiler context manager, user interface to manage profile process.
 
     Parameters:
-        targets (iterable): list of tracing targets, currently supported values:
-        ``paddle.profiler.ProfilerTarget.CPU``,
-        ``paddle.profiler.ProfilerTarget.GPU``.
-        scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. 
-            If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
+        targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` .
+        scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``.
+            If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
             which means profiling range [start_batch, end_batch).
         on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing.
-            This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
-            
+            This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
+
     Examples:
         1. profiling range [2, 5)
-        .. code-block:: python
-        import paddle.profiler as profiler
-        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
-                                        profiler.ProfilerTarget.GPU],
-                            scheduler = (2, 5),
-                            on_trace_ready = profiler.export_chrome_tracing('./log')
-                            ) as p:
-            for iter in range(N):
-            train()
-            p.step()
+
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                with profiler.Profiler(
+                        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                        scheduler = (2, 5),
+                        on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
+                    for iter in range(10):
+                        #train()
+                        p.step()
+
         2. profiling range [2,4], [7, 9], [11,13]
-        .. code-block:: python
-        import paddle.profiler as profiler
-        with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
-                                        profiler.ProfilerTarget.GPU],
-                            scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3),
-                            on_trace_ready = profiler.export_chrome_tracing('./log')
-                            ) as p:
-            for iter in range(N):
-            train()
-            p.step()
+
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                with profiler.Profiler(
+                        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                        scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3),
+                        on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
+                    for iter in range(10):
+                        #train()
+                        p.step()
+
         3. Use profiler without context manager, and use default parameters
-        .. code-block:: python
-        import paddle.profiler as profiler
-        p = profiler.Profiler()
-        p.start()
-        for iter in range(N):
-            train()
-            p.step()
-        p.stop()
-        p.summary()
+
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                p = profiler.Profiler()
+                p.start()
+                for iter in range(10):
+                    #train()
+                    p.step()
+                p.stop()
+                p.summary()
+
     """
 
     def __init__(
@@ -334,7 +363,22 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     def start(self):
         r'''
         Start profiler and enter the first profiler step(0).
-        State transformed from CLOSED to self.current_state and trigger corresponding action. 
+        State transformed from CLOSED to self.current_state and trigger corresponding action.
+
+        Examples:
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                prof = profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (1, 9),
+                    on_trace_ready = profiler.export_chrome_tracing('./log'))
+                prof.start()
+                for iter in range(10):
+                    #train()
+                    prof.step()
+                prof.stop()
         '''
         # CLOSED -> self.current_state
         if self.current_state == ProfilerState.READY:
@@ -354,6 +398,21 @@ def stop(self):
         r'''
         Stop profiler and State transformed from self.current_state to CLOSED.
         Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists.
+
+        Examples:
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                prof = profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (1, 7),
+                    on_trace_ready = profiler.export_chrome_tracing('./log'))
+                prof.start()
+                for iter in range(10):
+                    #train()
+                    prof.step()
+                prof.stop()
         '''
         # self.current_state -> CLOSED
         # In this situation, RECORD state is regarded as RECORD_AND_RETURN
@@ -375,6 +434,22 @@ def step(self):
         r"""
         Signals the profiler that the next profiling step has started.
         Get the new ProfilerState and trigger corresponding action.
+
+        Examples:
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                prof = profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 7),
+                    on_trace_ready = profiler.export_chrome_tracing('./log'))
+
+                prof.start()
+                for iter in range(10):
+                    #train()
+                    prof.step()
+                prof.stop()
         """
         if self.record_event:
             self.record_event.end()
@@ -448,6 +523,21 @@ def _trigger_action(self):
     def export(self, path="", format="json"):
         r"""
         Exports the tracing data in Chrome tracing data format.
+
+        Examples:
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                prof = profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 7))
+                prof.start()
+                for iter in range(10):
+                    #train()
+                    prof.step()
+                prof.stop()
+                prof.export(path="./profiler_data.json", format="json")
         """
         if self.profiler_result:
             self.profiler_result.save(path, format)
@@ -461,9 +551,35 @@ def summary(self,
         Print the Summary table.
 
         Parameters:
-            sorted_by: how to rank the op table items.
-            detail: expand each operator detail information.
-            thread_sep: print op table each thread.
-            time_unit: can be chosen form ['s', 'ms', 'us', 'ns']
+            sorted_by(SortedKeys): how to rank the op table items.
+            op_detail(bool): expand each operator detail information.
+            thread_sep(bool): print op table each thread.
+            time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns']
+
+        Examples:
+            .. code-block:: python
+
+                # required: gpu
+                import paddle.profiler as profiler
+                prof = profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 7),
+                    on_trace_ready = profiler.export_chrome_tracing('./log'))
+                prof.start()
+                for iter in range(10):
+                    #train()
+                    prof.step()
+                prof.stop()
+                prof.summary(sorted_by=profiler.SortedKeys.CPUTotal, op_detail=True, thread_sep=False, time_unit='ms')
         """
-        pass
+        if self.profiler_result:
+            statistic_data = StatisticData(
+                self.profiler_result.get_data(),
+                self.profiler_result.get_extra_info())
+            print(
+                _build_table(
+                    statistic_data,
+                    sorted_by=sorted_by,
+                    op_detail=op_detail,
+                    thread_sep=thread_sep,
+                    time_unit=time_unit))
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index 7400f21e91365..a0bbd6b633ef0 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,6 +34,22 @@
 class SortedKeys(Enum):
     r"""
     Sorted keys for printing summary table.
+
+    CPUTotal: Sorted by CPU total time.
+
+    CPUAvg: Sorted by CPU average time.
+
+    CPUMax: Sorted by CPU max time.
+
+    CPUMin: Sorted by CPU min time.
+
+    GPUTotal: Sorted by GPU total time.
+
+    GPUAvg: Sorted by GPU average time.
+
+    GPUMax: Sorted by GPU max time.
+
+    GPUMin: Sorted by GPU min time.
     """
     CPUTotal = 0
     CPUAvg = 1
@@ -642,6 +658,171 @@ def format_ratio(ratio, indent=0):
     append('')
     append('')
 
+    ###### Print Model Summary Report ######
+    model_perspective_items = statistic_data.event_summary.model_perspective_items
+    if model_perspective_items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 15
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Model Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        accmulation_time = 0
+        row_values = [
+            'Total Time', '-', '{} / - / - / - / {}'.format(
+                format_time(
+                    total_time, unit=time_unit), format_ratio(1)),
+            '- / - / - / -/ -'
+        ]
+        append(row_format.format(*row_values))
+        for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']:
+            if name in model_perspective_items:
+                item = model_perspective_items[name]
+                row_values = [
+                    '  {}'.format(name), item.call,
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.cpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_cpu_time, unit=time_unit),
+                        format_time(
+                            item.max_cpu_time, unit=time_unit),
+                        format_time(
+                            item.min_cpu_time, unit=time_unit),
+                        format_ratio(float(item.cpu_time) / total_time)),
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.gpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_gpu_time, unit=time_unit),
+                        format_time(
+                            item.max_gpu_time, unit=time_unit),
+                        format_time(
+                            item.min_gpu_time, unit=time_unit),
+                        format_ratio(float(item.gpu_time) / total_time))
+                ]
+                append(row_format.format(*row_values))
+                accmulation_time += item.cpu_time
+
+        other_time = total_time - accmulation_time
+        row_values = [
+            '  Others', '-', '{} / - / - / - / {}'.format(
+                format_time(
+                    other_time, unit=time_unit),
+                format_ratio(float(other_time) / total_time)),
+            '- / - / - / - / -'
+        ]
+        append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+
+    ###### Print Distribution Summary Report ######
+    if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange:
+        headers = [
+            'Name',
+            'Total Time',
+            'Ratio (%)',
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+
+        DEFAULT_COLUMN_WIDTH = 20
+        for _ in headers:
+            add_column(DEFAULT_COLUMN_WIDTH)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Distribution Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        cpu_communication_time_range = []
+        gpu_communication_time_range = []
+        cpu_communication_time_range = merge_ranges(
+            statistic_data.time_range_summary.CPUTimeRange[
+                TracerEventType.Communication], cpu_communication_time_range)
+        kernel_time_range = []
+        for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
+        ):
+            kernel_time_range = merge_ranges(
+                device_time_ranges[TracerEventType.Kernel],
+                kernel_time_range,
+                is_sorted=True)
+            gpu_communication_time_range = merge_ranges(
+                device_time_ranges[TracerEventType.Communication],
+                gpu_communication_time_range,
+                is_sorted=True)
+        communication_time_range = merge_ranges(
+            cpu_communication_time_range,
+            gpu_communication_time_range,
+            is_sorted=True)
+        computation_time_range = subtract_ranges(kernel_time_range,
+                                                 gpu_communication_time_range)
+        overlap_time_range = intersection_ranges(communication_time_range,
+                                                 computation_time_range)
+        communication_time = sum_ranges(communication_time_range)
+        computation_time = sum_ranges(computation_time_range)
+        overlap_time = sum_ranges(overlap_time_range)
+        row_values = [
+            'Communication', format_time(
+                communication_time, unit=time_unit),
+            format_ratio(float(communication_time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+
+        row_values = [
+            'Computation', format_time(
+                computation_time, unit=time_unit),
+            format_ratio(float(computation_time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+
+        row_values = [
+            'Overlap', format_time(
+                overlap_time, unit=time_unit),
+            format_ratio(float(overlap_time) / total_time)
+        ]
+        append(row_format.format(*row_values))
+        append(header_sep)
+        append(
+            "Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n"
+            "Computation time: Kernel time, substract kernels belong to communication op.\n"
+            "Overlap time: Communication time intersect with computation time.\n"
+            "Example:\n"
+            "Communication:\n"
+            "  CPU:              |_________________|\n"
+            "  GPU:                                  |______________|\n"
+            "  Total:            |_________________| |______________|\n"
+            "Computation time(Kernel):\n"
+            "  GPU:         |________________|\n"
+            "Overlap time:       |___________|\n")
+        append('-' * line_length)
+        append('')
+        append('')
+
     ###### Print Operator Summary Report ######
     if statistic_data.event_summary.items:
         headers = [
@@ -708,11 +889,6 @@ def format_ratio(ratio, indent=0):
                 sorted_items = sorted(
                     items.items(), key=lambda x: x[1].min_gpu_time)
 
-            total_cpu_time = 0
-            total_gpu_time = 0
-            for name, item in sorted_items:
-                total_cpu_time += item.cpu_time
-                total_gpu_time += item.gpu_time
             for name, item in sorted_items:
                 row_values = [
                     name, item.call, '{} / {} / {} / {} / {}'.format(
@@ -724,7 +900,7 @@ def format_ratio(ratio, indent=0):
                             item.max_cpu_time, unit=time_unit),
                         format_time(
                             item.min_cpu_time, unit=time_unit),
-                        format_ratio(float(item.cpu_time) / total_cpu_time)),
+                        format_ratio(float(item.cpu_time) / total_time)),
                     '{} / {} / {} / {} / {}'.format(
                         format_time(
                             item.gpu_time, unit=time_unit),
@@ -734,7 +910,7 @@ def format_ratio(ratio, indent=0):
                             item.max_gpu_time, unit=time_unit),
                         format_time(
                             item.min_gpu_time, unit=time_unit),
-                        format_ratio(float(item.gpu_time) / total_gpu_time))
+                        format_ratio(float(item.gpu_time) / total_time))
                 ]
                 append(row_format.format(*row_values))
                 if op_detail:
@@ -752,8 +928,7 @@ def format_ratio(ratio, indent=0):
                                 format_time(
                                     innerop_node.min_cpu_time, unit=time_unit),
                                 format_ratio(
-                                    float(innerop_node.cpu_time) /
-                                    total_cpu_time)),
+                                    float(innerop_node.cpu_time) / total_time)),
                             '{} / {} / {} / {} / {}'.format(
                                 format_time(
                                     innerop_node.gpu_time, unit=time_unit),
@@ -764,8 +939,7 @@ def format_ratio(ratio, indent=0):
                                 format_time(
                                     innerop_node.min_gpu_time, unit=time_unit),
                                 format_ratio(
-                                    float(innerop_node.gpu_time) /
-                                    total_gpu_time))
+                                    float(innerop_node.gpu_time) / total_time))
                         ]
                         append(row_format.format(*row_values))
                         for device_node_name, devicenode in innerop_node.devices.items(
@@ -792,7 +966,7 @@ def format_ratio(ratio, indent=0):
                                         unit=time_unit),
                                     format_ratio(
                                         float(devicenode.gpu_time) /
-                                        total_gpu_time))
+                                        total_time))
                             ]
                             append(row_format.format(*row_values))
                     for device_node_name, device_node in item.devices.items():
@@ -814,11 +988,160 @@ def format_ratio(ratio, indent=0):
                                 format_time(
                                     devicenode.min_gpu_time, unit=time_unit),
                                 format_ratio(
-                                    float(devicenode.gpu_time) /
-                                    total_gpu_time))
+                                    float(devicenode.gpu_time) / total_time))
                         ]
                         append(row_format.format(*row_values))
         append(header_sep)
         append('')
         append('')
+
+    ###### Print Memory Manipulation Summary Report ######
+    if statistic_data.event_summary.memory_manipulation_items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 30
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "Memory Manipulation Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
+        for name, item in memory_manipulation_items.items():
+            row_values = [
+                name,
+                item.call,
+                '{} / {} / {} / {} / {}'.format(
+                    format_time(
+                        item.cpu_time, unit=time_unit),
+                    format_time(
+                        item.avg_cpu_time, unit=time_unit),
+                    format_time(
+                        item.max_cpu_time, unit=time_unit),
+                    format_time(
+                        item.min_cpu_time, unit=time_unit),
+                    format_ratio(float(item.cpu_time) / total_time)),
+                '{} / {} / {} / {} / {}'.format(
+                    format_time(
+                        item.gpu_time, unit=time_unit),
+                    format_time(
+                        item.avg_gpu_time, unit=time_unit),
+                    format_time(
+                        item.max_gpu_time, unit=time_unit),
+                    format_time(
+                        item.min_gpu_time, unit=time_unit),
+                    format_ratio(float(item.gpu_time) / total_time)),
+            ]
+            append(row_format.format(*row_values))
+        append(header_sep)
+        append('')
+        append('')
+    ###### Print UserDefined Summary Report ######
+    if statistic_data.event_summary.userdefined_items:
+        headers = [
+            'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
+            'GPU Total / Avg / Max / Min / Ratio(%)'
+        ]
+        row_format_list = [""]
+        header_sep_list = [""]
+        line_length_list = [-SPACING_SIZE]
+        name_column_width = 30
+        add_column(name_column_width)
+        add_column(6)
+        add_column(40)
+        add_column(40)
+
+        row_format = row_format_list[0]
+        header_sep = header_sep_list[0]
+        line_length = line_length_list[0]
+
+        # construct table string
+        append(add_title(line_length, "UserDefined Summary"))
+        append('Time unit: {}'.format(time_unit))
+        append(header_sep)
+        append(row_format.format(*headers))
+        append(header_sep)
+        if thread_sep == True:
+            userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
+        else:
+            userdefined_thread_items = {
+                'All threads merged':
+                statistic_data.event_summary.userdefined_items
+            }
+        for thread_id, items in userdefined_thread_items.items():
+            append(add_title(line_length, "Thread: {}".format(thread_id)))
+            if sorted_by == SortedKeys.CPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].cpu_time, reverse=True)
+            elif sorted_by == SortedKeys.CPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_cpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.CPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_cpu_time)
+            elif sorted_by == SortedKeys.GPUTotal:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].gpu_time, reverse=True)
+            elif sorted_by == SortedKeys.GPUAvg:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].avg_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMax:
+                sorted_items = sorted(
+                    items.items(),
+                    key=lambda x: x[1].max_gpu_time,
+                    reverse=True)
+            elif sorted_by == SortedKeys.GPUMin:
+                sorted_items = sorted(
+                    items.items(), key=lambda x: x[1].min_gpu_time)
+
+            for name, item in sorted_items:
+                row_values = [
+                    name,
+                    item.call,
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.cpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_cpu_time, unit=time_unit),
+                        format_time(
+                            item.max_cpu_time, unit=time_unit),
+                        format_time(
+                            item.min_cpu_time, unit=time_unit),
+                        format_ratio(float(item.cpu_time) / total_time)),
+                    '{} / {} / {} / {} / {}'.format(
+                        format_time(
+                            item.gpu_time, unit=time_unit),
+                        format_time(
+                            item.avg_gpu_time, unit=time_unit),
+                        format_time(
+                            item.max_gpu_time, unit=time_unit),
+                        format_time(
+                            item.min_gpu_time, unit=time_unit),
+                        format_ratio(float(item.gpu_time) / total_time)),
+                ]
+                append(row_format.format(*row_values))
+            append(header_sep)
     return ''.join(result)
diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py
index 642001dfbfc5a..7fa7a27bad7bf 100644
--- a/python/paddle/profiler/utils.py
+++ b/python/paddle/profiler/utils.py
@@ -1,24 +1,25 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.core import (_RecordEvent, TracerEventType,
-                               load_profiler_result)
 from typing import Any
 from warnings import warn
 import functools
 from contextlib import ContextDecorator
 
+from paddle.fluid.core import (_RecordEvent, TracerEventType)
+import paddle.fluid.core as core
+
 _AllowedEventTypeList = [
     TracerEventType.Dataloader, TracerEventType.ProfileStep,
     TracerEventType.UserDefined, TracerEventType.Forward,
@@ -32,14 +33,28 @@ class RecordEvent(ContextDecorator):
     Interface for recording a time range.
 
     Parameters:
-    name(str): Name of the record event
-    event_type(TracerEventType): Type of the record event, can be used for statistics.
+        name(str): Name of the record event
 
     Examples:
         .. code-block:: python
-        import paddle.profiler as profiler
-        with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined):
-            op1()
+
+            import paddle
+            import paddle.profiler as profiler
+            # method1: using context manager
+            with profiler.RecordEvent("record_add"):
+                data1 = paddle.randn(shape=[3])
+                data2 = paddle.randn(shape=[3])
+                result = data1 + data2
+            # method2: call begin() and end()
+            record_event = profiler.RecordEvent("record_add")
+            record_event.begin()
+            data1 = paddle.randn(shape=[3])
+            data2 = paddle.randn(shape=[3])
+            result = data1 + data2
+            record_event.end()
+
+    Note:
+        RecordEvent will take effect only when profiler is on and at the state of RECORD.
     """
 
     def __init__(self,
@@ -57,6 +72,20 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
         self.end()
 
     def begin(self):
+        r"""
+        Record the time of begining.
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.profiler as profiler
+            record_event = profiler.RecordEvent("record_sub")
+            record_event.begin()
+            data1 = paddle.randn(shape=[3])
+            data2 = paddle.randn(shape=[3])
+            result = data1 - data2
+            record_event.end()
+        """
         if self.event_type not in _AllowedEventTypeList:
             warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
                   can be recorded.".format(*_AllowedEventTypeList))
@@ -67,10 +96,51 @@ def begin(self):
             self.event = _RecordEvent(self.name, self.event_type)
 
     def end(self):
+        r'''
+        Record the time of ending.
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.profiler as profiler
+            record_event = profiler.RecordEvent("record_mul")
+            record_event.begin()
+            data1 = paddle.randn(shape=[3])
+            data2 = paddle.randn(shape=[3])
+            result = data1 * data2
+            record_event.end()
+        '''
         if self.event:
             self.event.end()
 
 
+def load_profiler_result(filename: str):
+    r"""
+    Load dumped profiler data back to memory.
+
+    Parameters:
+        filename(str): Name of the exported protobuf file of profiler data.
+
+    Returns:
+        ProfilerResult object.
+
+    Examples:
+        .. code-block:: python
+
+            # required: gpu
+            import paddle.profiler as profiler
+            with profiler.Profiler(
+                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
+                    scheduler = (3, 10)) as p:
+                for iter in range(10):
+                    #train()
+                    p.step()
+            p.export('test_export_protobuf.pb', format='pb')
+            profiler_result = profiler.load_profiler_result('test_export_protobuf.pb')
+    """
+    return core.load_profiler_result(filename)
+
+
 def wrap_optimizers():
     def optimizer_warpper(func):
         @functools.wraps(func)
diff --git a/python/paddle/signal.py b/python/paddle/signal.py
index cd8ba2b58a8c9..f5b225bc6da2d 100644
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
@@ -119,10 +119,11 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
             f'Unexpected hop_length: {hop_length}. It should be an positive integer.'
         )
 
-    if frame_length > x.shape[axis]:
-        raise ValueError(
-            f'Attribute frame_length should be less equal than sequence length, '
-            f'but got ({frame_length}) > ({x.shape[axis]}).')
+    if in_dygraph_mode():
+        if frame_length > x.shape[axis]:
+            raise ValueError(
+                f'Attribute frame_length should be less equal than sequence length, '
+                f'but got ({frame_length}) > ({x.shape[axis]}).')
 
     op_type = 'frame'
 
@@ -306,8 +307,7 @@ def stft(x,
             y1 = stft(x, n_fft=512, center=False, onesided=False)  # [8, 512, 372]
     """
     check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'complex64', 'complex128'],
-        'stft')
+        x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft')
 
     x_rank = len(x.shape)
     assert x_rank in [1, 2], \
@@ -325,8 +325,9 @@ def stft(x,
     if win_length is None:
         win_length = n_fft
 
-    assert 0 < n_fft <= x.shape[-1], \
-        f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
+    if in_dygraph_mode():
+        assert 0 < n_fft <= x.shape[-1], \
+            f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
 
     assert 0 < win_length <= n_fft, \
         f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
@@ -359,7 +360,7 @@ def stft(x,
     x_frames = x_frames.transpose(
         perm=[0, 2,
               1])  # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
-    x_frames = x_frames * window
+    x_frames = paddle.multiply(x_frames, window)
 
     norm = 'ortho' if normalized else 'backward'
     if is_complex(x_frames):
@@ -495,18 +496,22 @@ def istft(x,
     n_frames = x.shape[-1]
     fft_size = x.shape[-2]
 
-    if onesided:
-        assert (fft_size == n_fft // 2 + 1), \
-            'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
-    else:
-        assert (fft_size == n_fft), \
-            'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size)
+    if in_dygraph_mode():
+        if onesided:
+            assert (fft_size == n_fft // 2 + 1), \
+                'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
+        else:
+            assert (fft_size == n_fft), \
+                'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size)
 
     if window is not None:
         assert len(window.shape) == 1 and len(window) == win_length, \
             'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape)
     else:
-        window = paddle.ones(shape=(win_length, ))
+        window_dtype = paddle.float32 if x.dtype in [
+            paddle.float32, paddle.complex64
+        ] else paddle.float64
+        window = paddle.ones(shape=(win_length, ), dtype=window_dtype)
 
     if win_length < n_fft:
         pad_left = (n_fft - win_length) // 2
@@ -534,15 +539,15 @@ def istft(x,
             x = x[:, :, :n_fft // 2 + 1]
         out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
 
+    out = paddle.multiply(out, window).transpose(
+        perm=[0, 2, 1])  # (batch, n_fft, num_frames)
     out = overlap_add(
-        x=(out * window).transpose(
-            perm=[0, 2, 1]),  # (batch, n_fft, num_frames)
-        hop_length=hop_length,
-        axis=-1)  # (batch, seq_length)
+        x=out, hop_length=hop_length, axis=-1)  # (batch, seq_length)
 
     window_envelop = overlap_add(
         x=paddle.tile(
-            x=window * window, repeat_times=[n_frames, 1]).transpose(
+            x=paddle.multiply(window, window).unsqueeze(0),
+            repeat_times=[n_frames, 1]).transpose(
                 perm=[1, 0]),  # (n_fft, num_frames)
         hop_length=hop_length,
         axis=-1)  # (seq_length, )
@@ -561,7 +566,7 @@ def istft(x,
         window_envelop = window_envelop[start:start + length]
 
     # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
-    if window_envelop.abs().min().item() < 1e-11:
+    if in_dygraph_mode() and window_envelop.abs().min().item() < 1e-11:
         raise ValueError(
             'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).'
         )
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index f06c45cc36973..7c0c71951aa1d 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -193,7 +193,7 @@ def from_tensor(cls, tensor, name=None):
                 print(x_spec)  # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x)
 
         """
-        if isinstance(tensor, (Variable, core.VarBase)):
+        if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)):
             return cls(tensor.shape, tensor.dtype, name or tensor.name)
         else:
             raise ValueError(
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 6555ba0812d08..9cef336aa54ae 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -127,7 +127,7 @@ def _handle_dtype(data, dtype):
                     "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
                     "this means the input data contains nested lists with different lengths. "
                 )
-        elif isinstance(data, paddle.Tensor):
+        elif isinstance(data, (paddle.Tensor, core.eager.Tensor)):
             data = data._copy_to(place, False)
             data = _handle_dtype(data, dtype)
             data.stop_gradient = stop_gradient
@@ -974,6 +974,8 @@ def diag(x, offset=0, padding_value=0, name=None):
           # [4]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_diag(x, offset, padding_value)
         return _C_ops.diag_v2(x, "offset", offset, "padding_value",
                               padding_value)
 
diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py
index 040480c26faa8..06c2a82fd696d 100644
--- a/python/paddle/tensor/einsum.py
+++ b/python/paddle/tensor/einsum.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import itertools
+import numpy as np
 import re
 
-from .linalg import matmul, transpose
+from .linalg import dot, matmul, transpose
 from .manipulation import squeeze, unsqueeze, reshape
 from .math import multiply
 from .math import sum as paddle_sum
@@ -111,36 +112,6 @@ def validate_rhs(rhs, input_labels, n_bcast_dims):
         f"Invalid equation: duplicate output labels are found.")
 
 
-#     '''
-#     Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. 
-#     We follow the Numpy broadcasting convention which states that, by lining up the shape arrays
-#     starting from the right most dimension, all the aligned dimensions either have equal sizes or
-#     one of them is sized one.
-#     Parameters
-#     ----------
-#     args:
-#         *args unpacks into operand one's axes range, shape, operand two's axes range, shape
-#     f: 
-#         if available, is used as a callback for postprocessing the aligned operand dimensions.
-#     '''
-#     xran, xshape, yran, yshape = args
-#
-#     xran_inv, yran_inv = xran[::-1], yran[::-1]
-#
-#     for xi, yi in zip(xran_inv, yran_inv):
-#         xs, ys = xshape[xi], yshape[yi]
-#         cond = xs == ys or xs == 1 or ys == 1
-#         if not cond:
-#             return False
-#
-#     if not f:
-#         return True
-#
-#     # Apply the callback to each aligned dimension pair
-#     for xi, yi in zip(xran_inv, yran_inv):
-#         f(xi, yi)
-
-
 def build_view(in_labels, out_labels):
     '''
     Build an inverse map of dimension indices. Three conditions must hold for 
@@ -291,39 +262,12 @@ def build_global_shape(g_view, g_labels, op_shapes):
 
     g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape]
 
-    g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes]
+    g_masks = [[s > 1 or s == -1 for s in view_shape]
+               for view_shape in view_shapes]
 
     return g_shape, g_masks
 
 
-def dim_strides(shape):
-    '''
-    Returns the dimension strides for a tensor shape
-    '''
-    strides = []
-    stride = 1
-    for size in shape[::-1]:
-        strides.append(stride)
-        stride = stride * size
-    return strides
-
-
-def create_view(operand, *view_def):
-    '''
-    Create and materialize a view.
-    
-    Parameters
-    ----------
-    operand:
-        the base tensor operand
-    view_def: 
-        include two lists which define the view's dimension sizes and strides
-    '''
-    assert False, f'Diagonal and trace not implemented yet.'
-    view_shape, view_strides = view_def
-    return operand.create_view(view_shape, view_strides)
-
-
 def has_duplicated_labels(labels):
     '''
     Returns True if there is any duplicate label.
@@ -337,46 +281,17 @@ def diagonalize(labels, operand):
     Merges dimensions with duplicate labels. 
     
     For those dimensions with duplicate labels, merge them into one dimension
-    which represents the diagonal elements. That requires the duplicate labeled
-    dimensions equal sized. The order of dimensions is kept unchanged up to 
-    the left-most appearance of each label.
+    which represents the diagonal elements. This requires the dimensions with
+    duplicate labels are equal sized.
     
     Examples
     -------- 
     'ijj...i' would be merged into 'ij...'
     '''
-    if not has_duplicated_labels(labels):
-        return labels, operand
-
-    strides = dim_strides(operand.shape)
-    shape = operand.shape
-    new_labels = []
-    new_shape = []
-    new_strides = []
-
-    for ax, l in enumerate(labels):
-        if l == '.' or l not in new_labels:
-            # not duplicate
-            new_labels.append(l)
-            new_strides.append(strides[ax])
-            new_shape.append(shape[ax])
-        else:
-            # duplicate label
-            diag_ax = new_labels.index(l)
-            new_strides[diag_ax] += strides[ax]
+    assert not has_duplicated_labels(labels), (
+        f'Duplicate labels are not supported.')
 
-    # Call framework API to build a new tensor
-    new_op = create_view(operand, new_shape, new_strides)
-    return new_labels, new_op
-
-
-def prod(iter, default=1):
-    if len(iter):
-        res = 1
-        for s in iter:
-            res *= s
-        return res
-    return default
+    return labels, operand
 
 
 def plan_reduce(plan, op, reduce_dims, keepdim):
@@ -408,102 +323,108 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
 
     op1_view, op2_view = [g_view[op] for op in (op1, op2)]
 
-    # Note, I may index into -1
-    I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0]
-    I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0]
-    J1_dims = [op1_view[ax] for ax in J1]
-    J2_dims = [op2_view[ax] for ax in J2]
-    K1_dims = [op1_view[ax] for ax in K]
-    K2_dims = [op2_view[ax] for ax in K]
+    I1 = [idx for idx in I if op1_view[idx] >= 0]
+    I2 = [idx for idx in I if op2_view[idx] >= 0]
+    op1_view = np.array(op1_view)
+    op1_dims = op1_view[I1 + J1 + K]
 
-    op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)]
-    op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)]
-    op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)]
-
-    I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes]
-                                    for axes in (I, J1, K)]
-    I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes]
-                                    for axes in (I, J2, K)]
+    op2_view = np.array(op2_view)
+    op2_dims = op2_view[I2 + J2 + K]
 
-    K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape)
+    op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)]
+    op1_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op1_mask)])
+    op2_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op2_mask)])
+    vshape = np.maximum(op1_vshape, op2_vshape)
 
-    perm1 = I1_dims + J1_dims + K1_dims
-    perm2 = I2_dims + J2_dims + K2_dims
+    i1, i2, j1, j2, k = map(len, (I1, I2, J1, J2, K))
 
-    if any(i != dim for i, dim in enumerate(perm1)):
+    if any(op1_dims != np.arange(len(op1_dims))):
         # print(f'perm1: {perm1}')
-        step = transpose, [var1], var1, perm1
+        step = transpose, [var1], var1, list(op1_dims)
         plan.add_step(step)
 
-    if any(i != dim for i, dim in enumerate(perm2)):
+    if any(op2_dims != np.arange(len(op2_dims))):
         # print(f'perm2: {perm2}')
-        step = transpose, [var2], var2, perm2
+        step = transpose, [var2], var2, list(op2_dims)
         plan.add_step(step)
 
-    # In case of no K... dimensions, do a broadcast
-    if not K:
-        # unsqueeze operands include J1...J2... dimensions
-        if J2:
-            fill_start = len(I2_dims) + len(J1)
-            fill_end = fill_start + len(J2)
-            fill = list(range(fill_start, fill_end))
-            step = unsqueeze, [var1], var1, fill
-            plan.add_step(step)
-        if J1:
-            fill_start = len(I2_dims)
-            fill_end = fill_start + len(J1)
-            fill = list(range(fill_start, fill_end))
-            step = unsqueeze, [var2], var2, fill
-            plan.add_step(step)
-        # make broadcast
-        step = multiply, [var1, var2], var2
-        plan.add_step(step)
-    # K... are there, let's reason about I... and J...
-    # In case I... and J... are empty, do the vector-vector version of matmul
-    elif not I and not J1 and not J2:
-        # merge K dimensions
-        if len(K) > 1:
-            for var in var1, var2:
-                step = reshape, [var], var, [K1_size]
-                plan.add_step(step)
-        # Build vector-vector matmul
-        step = matmul, [var1, var2], var2
-        plan.add_step(step)
-    # General case, there are K... and some I... and J..., the actual operation will be 
-    # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes.
-    else:
-        # Merge J dims and K dims by reshaping
-        merged_shape1 = I1_shape + [J1_size] + [K1_size]
-        merged_shape2 = I2_shape + [J2_size] + [K1_size]
+    # Check if conditions hold for turnning the operation into a matmul
+    if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate(
+        (op1_vshape, op2_vshape)):
+        op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1])
+                                           ] + [np.prod(op1_vshape[K])]
+        op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2])
+                                           ] + [np.prod(op2_vshape[K])]
 
-        step = reshape, [var1], var1, merged_shape1
+        # Merge J dims and K dims by reshaping
+        step = reshape, [var1], var1, op1_shape
         plan.add_step(step)
-        step = reshape, [var2], var2, merged_shape2
+        step = reshape, [var2], var2, op2_shape
         plan.add_step(step)
 
         # Matmul
         step = matmul, [var1, var2], var2, False, True
         plan.add_step(step)
 
-    # The result shape is in I..., J1, J2. Let's reshape back to known dimensions
-    # Note, this is static deduction, not by reading the tensor shape at runtime
-    result_shape = [1] * len(I)
-    for i, ax in enumerate(I):
-        result_shape[i] = max(op1_vshape[ax], op2_vshape[ax])
-    if J1:
-        result_shape += J1_shape
-    if J2:
-        result_shape += J2_shape
-
-    # Need a scalar dimension somehow
-    if result_shape:
-        step = reshape, [var2], var2, result_shape
+        # Reshape back
+        shape = list(vshape[I + J1 + J2])
+        step = reshape, [var2], var2, shape
         plan.add_step(step)
 
+    elif j1 == j2 == k == 1:
+        # Can still do matmul even unknown shapes are present
+        step = matmul, [var1, var2], var2, False, True
+        plan.add_step(step)
+
+    # In the rest cases we opt for ops other than matmul 
+    else:
+        # unsqueeze operands include J1...J2... dimensions
+        if j2:
+            fill = list(range(i1 + j1, i1 + j1 + j2))
+            step = unsqueeze, [var1], var1, fill
+            plan.add_step(step)
+        if j1:
+            fill = list(range(i2, i2 + j1))
+            step = unsqueeze, [var2], var2, fill
+            plan.add_step(step)
+        # In case of no dimensions to contract, do an elementwise multiply
+        if k == 0:
+            # make broadcast
+            step = multiply, [var1, var2], var2
+            plan.add_step(step)
+        # Contract and no join, turn into a dot
+        elif j1 + j2 == 0 and k == 1:
+            step = unsqueeze, [var1], var1, [-2]
+            plan.add_step(step)
+            step = unsqueeze, [var2], var2, [-1]
+            plan.add_step(step)
+            step = matmul, [var1, var2], var2
+            plan.add_step(step)
+            step = squeeze, [var2], var2, [-1, -2]
+            plan.add_step(step)
+        elif j1 + j2 == 0 and not-1 in np.concatenate(
+            (op1_vshape[K], op2_vshape[K])):
+            assert all(op1_vshape[K] == op2_vshape[K])
+            step = reshape, [var1], var1, list(op1_vshape[
+                I]) + [1] + [np.prod(op1_vshape[K])]
+            plan.add_step(step)
+            step = reshape, [var2], var2, list(op2_vshape[
+                I]) + [1] + [np.prod(op2_vshape[K])]
+            plan.add_step(step)
+            step = matmul, [var1, var2], var2, False, True
+            plan.add_step(step)
+            step = squeeze, [var2], var2, [-1, -2]
+            plan.add_step(step)
+        else:
+            step = multiply, [var1, var2], var2
+            plan.add_step(step)
+            reduce_dims = list(range(-k, 0))
+            plan_reduce(plan, op2, reduce_dims, keepdim=False)
+
     # Wrap up, updating auxiliary data
     # Updating g_mask for I and J axes
-    for i, ax in enumerate(I + J1 + J2):
-        op2_mask[ax] = (result_shape[i] > 1)
+    for ax in I + J1 + J2:
+        op2_mask[ax] = vshape[ax] > 1 or vshape[ax] == -1
 
     for ax in K:
         op2_mask[ax] = False
@@ -514,6 +435,8 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K):
     for ax in I + J1 + J2:
         op2_view[ax], dim = dim, dim + 1
 
+    g_view[op2] = list(op2_view)
+
 
 def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count,
                    n_bcast):
@@ -737,7 +660,6 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast):
     return plan
 
 
-@dygraph_only
 def einsum(equation, *operands):
     r"""
     einsum(equation, *operands)
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index fef1652040835..6c82539ec608d 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -14,7 +14,7 @@
 
 import numpy as np
 from ..fluid.layer_helper import LayerHelper
-from ..framework import _varbase_creator, _dygraph_tracer
+from ..framework import _varbase_creator, _dygraph_tracer, _in_eager_mode
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
 from ..static import Variable
 
@@ -147,7 +147,9 @@ def __check_input(x, y):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
             check_variable_and_dtype(
-                val, name, ['float16', 'float32', 'float64'], 'matmul')
+                val, name,
+                ['float16', 'float32', 'float64', 'complex64', 'complex128'],
+                'matmul')
 
     __check_input(x, y)
 
@@ -1146,6 +1148,8 @@ def cross(x, y, axis=None, name=None):
             #  [0. 0. 0.]]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_cross(x, y, axis)
         if axis is not None:
             return _C_ops.cross(x, y, 'dim', axis)
         else:
@@ -1490,6 +1494,8 @@ def mv(x, vec, name=None):
             out = paddle.mv(x, vec)
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_mv(x, vec)
         out = _C_ops.mv(x, vec)
         return out
 
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 858f9139231e7..aa2d2e161181b 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -17,6 +17,7 @@
 from ..fluid.layers.layer_function_generator import templatedoc
 from ..static import Variable
 from ..framework import VarBase as Tensor
+from ..framework import _in_eager_mode
 
 # TODO: define logic functions of a tensor  
 from ..fluid.layers import is_empty  # noqa: F401
@@ -181,6 +182,9 @@ def equal(x, y, name=None):
         y = full(shape=[1], dtype=x.dtype, fill_value=y)
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_equal(x, y)
+
         return _C_ops.equal(x, y)
 
     check_variable_and_dtype(
@@ -223,6 +227,9 @@ def greater_equal(x, y, name=None):
             print(result1)  # result1 = [True False True]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_greater_equal(x, y)
+
         return _C_ops.greater_equal(x, y)
 
     check_variable_and_dtype(x, "x",
@@ -269,6 +276,9 @@ def greater_than(x, y, name=None):
             print(result1)  # result1 = [False False True]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_greater_than(x, y)
+
         return _C_ops.greater_than(x, y)
 
     check_variable_and_dtype(x, "x",
@@ -316,6 +326,9 @@ def less_equal(x, y, name=None):
             print(result1)  # result1 = [True True False]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_less_equal(x, y)
+
         return _C_ops.less_equal(x, y)
 
     check_variable_and_dtype(
@@ -359,6 +372,9 @@ def less_than(x, y, name=None):
             print(result1)  # result1 = [False True False]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_less_than(x, y)
+
         return _C_ops.less_than(x, y)
 
     check_variable_and_dtype(
@@ -402,6 +418,9 @@ def not_equal(x, y, name=None):
             print(result1)  # result1 = [False True True]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_not_equal(x, y)
+
         return _C_ops.not_equal(x, y)
 
     check_variable_and_dtype(
@@ -443,7 +462,7 @@ def is_tensor(x):
             print(check)  #False
             
     """
-    return isinstance(x, Tensor)
+    return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor))
 
 
 def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 32ccecbc6d9f0..298a18e971df4 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -16,7 +16,7 @@
 from collections import Counter
 
 from ..static import Variable, device_guard
-from ..framework import core
+from ..framework import core, _in_eager_mode
 from ..fluid.layer_helper import LayerHelper
 from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
@@ -263,6 +263,9 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):
 
 setattr(core.VarBase, 'fill_diagonal_tensor', fill_diagonal_tensor)
 
+if core._in_eager_mode():
+    setattr(core.eager.Tensor, 'fill_diagonal_tensor', fill_diagonal_tensor)
+
 
 @dygraph_only
 def tolist(x):
@@ -889,12 +892,20 @@ def stack(x, axis=0, name=None):
             x1 = paddle.to_tensor([[1.0, 2.0]])
             x2 = paddle.to_tensor([[3.0, 4.0]])
             x3 = paddle.to_tensor([[5.0, 6.0]])
+	    
             out = paddle.stack([x1, x2, x3], axis=0)
             print(out.shape)  # [3, 1, 2]
             print(out)
             # [[[1., 2.]],
             #  [[3., 4.]],
             #  [[5., 6.]]]
+	    
+	    out = paddle.stack([x1, x2, x3], axis=-2)
+	    print(out.shape)  # [1, 3, 2]
+	    print(out)
+	    # [[[1., 2.],
+	    #   [3., 4.],
+	    #   [5., 6.]]]
     """
     return layers.stack(x, axis, name)
 
@@ -1567,6 +1578,8 @@ def scatter(x, index, updates, overwrite=True, name=None):
             #  [1., 1.]]
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_scatter(x, index, updates, overwrite)
         return _C_ops.scatter(x, index, updates, 'overwrite', overwrite)
 
     check_variable_and_dtype(
@@ -1719,12 +1732,12 @@ def tile(x, repeat_times, name=None):
 
     Args:
         x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
-        repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
+        repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
             should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        N-D Tensor. The data type is the same as ``x``.
+        N-D Tensor. The data type is the same as ``x``. The size of the i-th dimension is equal to ``x[i] * repeat_times[i]``.
 
     Examples:
         .. code-block:: python
@@ -1734,16 +1747,18 @@ def tile(x, repeat_times, name=None):
             data = paddle.to_tensor([1, 2, 3], dtype='int32')
             out = paddle.tile(data, repeat_times=[2, 1])
             np_out = out.numpy()
-            # [[1, 2, 3], [1, 2, 3]]
+            # [[1, 2, 3]
+            #  [1, 2, 3]]
 
-            out = paddle.tile(data, repeat_times=[2, 2])
+            out = paddle.tile(data, repeat_times=(2, 2))
             np_out = out.numpy()
-            # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
+            # [[1, 2, 3, 1, 2, 3]
+            #  [1, 2, 3, 1, 2, 3]]
 
-            repeat_times = paddle.to_tensor([2, 1], dtype='int32')
+            repeat_times = paddle.to_tensor([1, 2], dtype='int32')
             out = paddle.tile(data, repeat_times=repeat_times)
             np_out = out.numpy()
-            # [[1, 2, 3], [1, 2, 3]]
+            # [[1, 2, 3, 1, 2, 3]]
     """
     if paddle.in_dynamic_mode():
         return _C_ops.tile(x, 'repeat_times', repeat_times)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 9a0139105651b..04ca7da104304 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -243,6 +243,8 @@ def add(x, y, name=None):
     """
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_add( x, y)
         return _C_ops.elementwise_add(x, y)
 
     return _elementwise_op(LayerHelper('elementwise_add', **locals()))
@@ -322,6 +324,8 @@ def subtract(x, y, name=None):
     axis = -1
     act = None
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_subtract(x, y)
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
     return _elementwise_op(LayerHelper(op_type, **locals()))
@@ -379,6 +383,8 @@ def divide(x, y, name=None):
     axis = -1
     act = None
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_divide( x, y)
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
@@ -508,6 +514,8 @@ def multiply(x, y, name=None):
     axis = -1
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_multiply(x, y)
         return _elementwise_op_in_dygraph(
             x, y, axis=axis, act=act, op_name=op_type)
 
@@ -1274,6 +1282,8 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
 
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_addmm( input, x, y, alpha, beta)
         out = _C_ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
         return out
 
@@ -1333,7 +1343,7 @@ def renorm(x, p, axis, max_norm):
             raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape)))
         axis = axis + len(input_shape)
     if paddle.in_dynamic_mode():
-        out = core.ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm)
+        out = _C_ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm)
         return out
 
     inputs = {'X': x}
@@ -3266,6 +3276,8 @@ def atan2(x, y, name=None):
     """
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_atan2( x, y)
         return _C_ops.atan2(x, y)
     else:
         check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2')
@@ -3783,13 +3795,13 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
         attrs_1 += ('starts', starts_1)
         ends_1 = [dim_len - 1]
         attrs_1 += ('ends', ends_1)
-        input_front = _C_ops.slice(new_input, None, None, 'axes', axes, \
+        input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
             'infer_flags', infer_flags, *attrs_1)
         starts_2 = [1]
         attrs_2 += ('starts', starts_2)
         ends_2 = [dim_len]
         attrs_2 += ('ends', ends_2)
-        input_back = _C_ops.slice(new_input, None, None, 'axes', axes, \
+        input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
             'infer_flags', infer_flags, *attrs_2)
 
         if x.dtype == paddle.bool:
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 0ba47d79050ce..2c6a7f7ead105 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -17,7 +17,7 @@
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
 from ..fluid import layers
-from ..framework import core
+from ..framework import core, _in_eager_mode
 from paddle.common_ops_import import convert_np_dtype_to_dtype_
 from paddle.common_ops_import import Variable
 from paddle.common_ops_import import VarDesc
@@ -123,7 +123,7 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None):
         axis(int, optional): Axis to compute indices along. The effective range
             is [-R, R), where R is x.ndim. when axis < 0, it works the same way
             as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
-        keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False.
+        keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False.
         dtype(str|np.dtype, optional): Data type of the output tensor which can
                     be int32, int64. The default value is 'int64', and it will
                     return the int64 indices.
@@ -144,12 +144,15 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None):
                                      [6,9,2,4]])
             out1 = paddle.argmax(x)
             print(out1) # 2
-            out2 = paddle.argmax(x, axis=1)
+            out2 = paddle.argmax(x, axis=0)
             print(out2) 
-            # [2 3 1]
+            # [2, 2, 0, 1]
             out3 = paddle.argmax(x, axis=-1)
             print(out3) 
-            # [2 3 1]
+            # [2, 3, 1]
+            out4 = paddle.argmax(x, axis=0, keepdim=True)
+            print(out4)
+            # [[2, 2, 0, 1]]
     """
     if axis is not None and not isinstance(axis, int):
         raise TypeError(
@@ -200,7 +203,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None):
         axis(int, optional): Axis to compute indices along. The effective range
             is [-R, R), where R is x.ndim. when axis < 0, it works the same way
             as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index.
-        keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False.
+        keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False.
         dtype(str): Data type of the output tensor which can
                     be int32, int64. The default value is 'int64', and it will
                     return the int64 indices.
@@ -221,12 +224,15 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None):
                                      [6,9,2,4]])
             out1 = paddle.argmin(x)
             print(out1) # 4
-            out2 = paddle.argmin(x, axis=1)
+            out2 = paddle.argmin(x, axis=0)
             print(out2) 
-            # [0 0 2]
+            # [1, 1, 1, 2]
             out3 = paddle.argmin(x, axis=-1)
             print(out3) 
-            # [0 0 2]
+            # [0, 0, 2]
+            out4 = paddle.argmin(x, axis=0, keepdim=True)
+            print(out4)
+            # [[1, 1, 1, 2]]
     """
     if axis is not None and not isinstance(axis, int):
         raise TypeError(
@@ -621,6 +627,9 @@ def where(condition, x=None, y=None, name=None):
         broadcast_condition = paddle.cast(broadcast_condition, 'bool')
 
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_where(broadcast_condition, broadcast_x,
+                                            broadcast_y)
         return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y)
     else:
         helper = LayerHelper("where", **locals())
@@ -712,6 +721,8 @@ def index_sample(x, index):
 
     """
     if paddle.in_dynamic_mode():
+        if _in_eager_mode():
+            return _C_ops.final_state_index_sample(x, index)
         return _C_ops.index_sample(x, index)
 
     helper = LayerHelper("index_sample", **locals())
diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py
index 85672ec7a36e6..91e5cfe97c6cd 100644
--- a/python/paddle/tensor/to_string.py
+++ b/python/paddle/tensor/to_string.py
@@ -263,14 +263,7 @@ def to_string(var, prefix='Tensor'):
         data=data)
 
 
-def tensor_to_string(tensor, prefix='Tensor'):
-    indent = len(prefix) + 1
-
-    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"
-
-    if not tensor._is_initialized():
-        return "Tensor(Not initialized)"
-
+def _format_dense_tensor(tensor, indent):
     np_tensor = tensor.numpy()
 
     if len(tensor.shape) == 0:
@@ -288,6 +281,26 @@ def tensor_to_string(tensor, prefix='Tensor'):
 
     data = _format_tensor(
         np_tensor, sumary, indent=indent, max_width=max_width, signed=signed)
+    return data
+
+
+def sparse_tensor_to_string(tensor, prefix='Tensor'):
+    indent = len(prefix) + 1
+    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{data})"
+    if tensor.is_sparse_coo():
+        indices_tensor = tensor.non_zero_indices()
+        elements_tensor = tensor.non_zero_elements()
+        indices_data = _format_dense_tensor(indices_tensor, indent)
+        elements_data = _format_dense_tensor(elements_tensor, indent)
+        data = 'non_zero_indices=' + indices_data + ',\nnon_zero_elements=' + elements_data
+    else:
+        crows_tensor = tensor.non_zero_crows()
+        cols_tensor = tensor.non_zero_cols()
+        elements_tensor = tensor.non_zero_elements()
+        crows_data = _format_dense_tensor(crows_tensor, indent)
+        cols_data = _format_dense_tensor(cols_tensor, indent)
+        elements_data = _format_dense_tensor(elements_tensor, indent)
+        data = 'non_zero_crows=' + crows_data + ',\nnon_zero_cols=' + cols_data + ',\nnon_zero_elements=' + elements_data
 
     return _template.format(
         prefix=prefix,
@@ -297,3 +310,25 @@ def tensor_to_string(tensor, prefix='Tensor'):
         stop_gradient=tensor.stop_gradient,
         indent=' ' * indent,
         data=data)
+
+
+def tensor_to_string(tensor, prefix='Tensor'):
+    indent = len(prefix) + 1
+
+    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"
+
+    if not tensor._is_dense_tensor_hold_allocation():
+        return "Tensor(Not initialized)"
+
+    if tensor.is_sparse():
+        return sparse_tensor_to_string(tensor, prefix)
+    else:
+        data = _format_dense_tensor(tensor, indent)
+        return _template.format(
+            prefix=prefix,
+            shape=tensor.shape,
+            dtype=tensor.dtype,
+            place=tensor._place_str,
+            stop_gradient=tensor.stop_gradient,
+            indent=' ' * indent,
+            data=data)
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 639afeb4c86fa..40d5b593a0576 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -5,6 +5,7 @@
     func : ElementwiseInferMeta
   kernel :
     func : add
+  backward : add_grad
 
 - api : cast
   args : (Tensor x, DataType out_dtype)
@@ -35,9 +36,9 @@
     func : conj
 
 - api : copy_to
-  args : (Tensor x, Backend backend, bool blocking)
+  args : (Tensor x, Place place, bool blocking)
   output : Tensor
-  invoke : copy_to_impl(x, backend, blocking)
+  invoke : copy_to_impl(x, place, blocking)
 
 - api : divide
   args : (Tensor x, Tensor y)
@@ -46,6 +47,7 @@
     func : ElementwiseInferMeta
   kernel :
     func : divide
+  backward : divide_grad
 
 - api : dot
   args : (Tensor x, Tensor y)
@@ -56,7 +58,7 @@
     func : dot
 
 - api : empty
-  args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
+  args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
   output: Tensor
   infer_meta :
     func : CreateInferMeta
@@ -68,7 +70,7 @@
     backend : place
 
 - api : empty_like
-  args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {})
   output: Tensor
   infer_meta :
     func : CreateLikeInferMeta
@@ -88,7 +90,7 @@
     func : flatten
 
 - api : full
-  args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
+  args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
   output: Tensor
   infer_meta :
     func : CreateInferMeta
@@ -100,7 +102,7 @@
     backend : place
 
 - api : full_like
-  args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
+  args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {})
   output: Tensor
   infer_meta :
     func : CreateLikeInferMeta
@@ -135,12 +137,21 @@
     func : ElementwiseInferMeta
   kernel :
     func : multiply
+  backward : multiply_grad
 
 - api : ones_like
-  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
   output : Tensor
   invoke : full_like(x, 1, dtype, place)
 
+- api : pool2d
+  args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
+  output : Tensor(out)
+  infer_meta :
+    func : PoolInferMeta
+  kernel:
+    func : pool2d
+
 - api : reshape
   args : (Tensor x, ScalarArray shape)
   output : Tensor(out)
@@ -158,6 +169,7 @@
   kernel :
     func : relu
   inplace : (x -> out)
+  backward: relu_grad
 
 - api : scale
   args : (Tensor x, Scalar scale, float bias, bool bias_after_scale)
@@ -177,6 +189,14 @@
   kernel :
     func : sign
 
+- api : softmax
+  args : (Tensor x, int axis)
+  output : Tensor
+  infer_meta :
+    func : SoftmaxInferMeta
+  kernel :
+    func : sotfmax
+
 - api : split
   args : (Tensor x, ScalarArray num_or_sections, Scalar axis)
   output : Tensor[]
@@ -189,6 +209,7 @@
     func : ElementwiseInferMeta
   kernel :
     func : subtract
+  backward : subtract_grad
 
 - api : sum
   args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
@@ -200,10 +221,19 @@
     data_type : x
 
 - api : zeros_like
-  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {})
   output : Tensor
   invoke : full_like(x, 0, dtype, place)
 
+
+- api : one_hot
+  args : (Tensor x, Scalar num_classes)
+  output : Tensor
+  infer_meta :
+    func : OneHotInferMeta
+  kernel :
+    func : one_hot
+
 - api : digamma
   args : (Tensor x)
   output : Tensor
@@ -249,3 +279,277 @@
   kernel :
     func : diagonal
   backward : diagonal_grad
+
+- api : gumbel_softmax
+  args : (Tensor x, float temperature, bool hard, int axis)
+  output : Tensor
+  infer_meta :
+    func : GumbelSoftmaxInferMeta
+  kernel :
+    func : gumbel_softmax
+  # backward : gumbel_softmax_grad
+
+- api : diag
+  args : (Tensor x, int offset, float padding_value)
+  output : Tensor
+  infer_meta :
+    func : DiagInferMeta
+  kernel :
+    func : diag
+
+# - api : pixel_shuffle
+#   args : (Tensor x, int upscale_factor, const std::string& data_format)
+#   output : Tensor
+#   infer_meta :
+#     func : PixelShuffleInferMeta
+#   kernel :
+#     func : pixel_shuffle
+
+- api : transpose
+  args : (Tensor x, int[] axis)
+  output : Tensor
+  infer_meta :
+    func : TransposeInferMeta
+  kernel :
+    func : transpose
+  backward : transpose_grad
+
+- api : lerp
+  args : (Tensor x, Tensor y, Tensor weight)
+  output : Tensor
+  infer_meta :
+    func : LerpInferMeta
+  kernel :
+    func : lerp
+  # backward : lerp_grad
+
+- api : scatter
+  args : (Tensor x, Tensor index, Tensor updates, bool overwrite)
+  output : Tensor
+  infer_meta :
+    func : ScatterInferMeta
+    dtype : x
+  kernel :
+    func : scatter
+  backward : scatter_grad
+
+
+- api : scatter_nd_add
+  args : (Tensor x, Tensor index, Tensor updates)
+  output : Tensor
+  infer_meta :
+    func : ScatterNdAddInferMeta
+    dtype : x
+  kernel :
+    func : scatter_nd_add
+  backward : scatter_nd_add_grad
+
+
+- api : addmm
+  args : (Tensor input, Tensor x, Tensor y, float alpha, float beta)
+  output : Tensor
+  infer_meta :
+    func : AddmmInferMeta
+  kernel :
+    func : addmm
+  backward : addmm_grad
+
+
+- api : adadelta
+  args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, float rho, float epsilon)
+  output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out)
+  infer_meta :
+    func : AdadeltaInferMeta
+  kernel :
+    func : adadelta
+
+- api : adamax
+  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon)
+  output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out)
+  infer_meta :
+    func : AdamaxInferMeta
+  kernel :
+    func : adamax
+
+
+
+- api : where
+  args : (Tensor condition, Tensor x, Tensor y)
+  output : Tensor
+  infer_meta :
+    func : WhereInferMeta
+  kernel :
+    func : where
+  backward : where_grad
+
+
+# BilinearTensorProductInferMeta
+
+# BroadcastTensorsInferMeta
+
+- api : less_than
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : less_than  
+
+- api : less_equal
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : less_equal
+
+- api : greater
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : greater
+
+- api : greater_equal
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : greater_equal
+
+- api : equal
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : equal
+
+- api : not_equal
+  args : (Tensor x, Tensor y, int axis = -1)
+  output : Tensor
+  infer_meta :
+    func : CompareInferMeta
+  kernel :
+    func : not_equal
+
+# - api : equal_all
+#   args : (Tensor x, Tensor y)
+#   output : Tensor
+#   infer_meta :
+#     func : CompareAllInferMeta
+#   kernel :
+#     func : equal_all
+
+
+- api : huber_loss
+  args : (Tensor input, Tensor label, float delta)
+  output : Tensor(out), Tensor(residual)
+  infer_meta :
+    func : HuberLossInferMeta
+  kernel :
+    func : huber_loss
+  # backward : huber_loss_grad
+
+- api : triangular_solve
+  args : (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular)
+  output : Tensor
+  infer_meta :
+    func : TriangularSolveInferMeta
+  kernel :
+    func : triangular_solve
+  # backward : triangular_solve_grad
+
+
+- api : index_sample
+  args : (Tensor x, Tensor index)
+  output : Tensor
+  infer_meta :
+    func : IndexSampleInferMeta
+  kernel :
+    func : index_sample
+    data_type : x
+  backward : index_sample_grad
+
+
+- api : cross
+  args : (Tensor x, Tensor y, int axis = 9)
+  output : Tensor
+  infer_meta :
+    func : CrossInferMeta
+  kernel :
+    func : cross
+  backward : cross_grad
+
+
+- api : atan2
+  args : (Tensor x, Tensor y)
+  output : Tensor
+  infer_meta :
+    func : Atan2InferMeta
+  kernel :
+    func : atan2
+  backward : atan2_grad
+
+
+- api : bce_loss
+  args : (Tensor input, Tensor label)
+  output : Tensor
+  infer_meta :
+    func : BCELossInferMeta
+  kernel :
+    func : bce_loss
+  backward : bce_loss_grad
+
+
+- api : dist
+  args : (Tensor x, Tensor y, float p)
+  output : Tensor
+  infer_meta :
+    func : DistInferMeta
+  kernel :
+    func : dist
+  # backward : dist_grad
+
+
+- api : gather_nd
+  args : (Tensor x, Tensor index)
+  output : Tensor
+  infer_meta :
+    func : GatherNdInferMeta
+  kernel :
+    func : gather_nd
+    data_type : x
+  backward : gather_nd_grad
+
+- api : gather_tree
+  args : (Tensor ids, Tensor parents)
+  output : Tensor
+  infer_meta :
+    func : GatherTreeMeta
+  kernel :
+    func : gather_tree
+
+- api : mv
+  args : (Tensor x, Tensor vec)
+  output : Tensor
+  infer_meta :
+    func : MvInferMeta
+  kernel :
+    func : mv
+  backward : mv_grad
+
+
+
+#  =================================== sep0
+
+
+#  =================================== sep1
+
+
+#  =================================== sep2
+
+
+#  =================================== sep3
diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py
index fe68548a22a6d..a42691d320875 100644
--- a/python/paddle/utils/code_gen/api_base.py
+++ b/python/paddle/utils/code_gen/api_base.py
@@ -99,7 +99,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]):
             'double': 'double',
             'bool': 'bool',
             'str': 'const std::string&',
-            'Backend': 'Backend',
+            'Place': 'Place',
             'DataLayout': 'DataLayout',
             'DataType': 'DataType',
             'int64[]': 'const std::vector<int64_t>&',
@@ -118,7 +118,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]):
             'float': 'paddle::optional<float>',
             'double': 'paddle::optional<double>',
             'bool': 'paddle::optional<bool>',
-            'Backend': 'paddle::optional<Backend>',
+            'Place': 'paddle::optional<Place>',
             'DataLayout': 'paddle::optional<DataLayout>',
             'DataType': 'paddle::optional<DataType>',
             'int64[]': 'paddle::optional<std::vector<int64_t>>',
@@ -327,9 +327,9 @@ def gene_kernel_select(self) -> str:
         attr_layout_count = 0
         attr_data_type_count = 0
         for attr_name in attrs['names']:
-            if attrs['attr_info'][attr_name][0] == 'Backend':
+            if attrs['attr_info'][attr_name][0] == 'Place':
                 assert kernel['backend'] is not None, \
-                    f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
+                    f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually."
                 attr_backend_count = attr_backend_count + 1
             if attrs['attr_info'][attr_name][0] == 'DataLayout':
                 assert kernel['layout'] is not None, \
@@ -348,8 +348,8 @@ def gene_kernel_select(self) -> str:
                 assert len(
                     vars_list
                 ) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
-                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
-                    f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
+                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \
+                    f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type."
                 kernel_select_code = kernel_select_code + f"""
   kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
 """
@@ -696,8 +696,9 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False):
             code_indent)
         outputs_args, kernel_output_names, output_create = self.gene_output(
             self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag)
+        api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
         return f"""
-{code_indent}  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+{code_indent}  const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
 {code_indent}      "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}});
 {code_indent}  VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
 {code_indent}  VLOG(6) << "{self.api} API kernel: " << kernel;
@@ -709,7 +710,10 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False):
 
 {code_indent}  using kernel_signature = {kernel_signature};
 {code_indent}  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-{code_indent}  (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  {{
+{code_indent}    paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1);
+{code_indent}    (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  }}
 
 {code_indent}  return {self.gene_return_code()};"""
 
@@ -719,6 +723,7 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False):
         outputs_args, kernel_output_names, output_create = self.gene_output(
             self.outputs['types'], 'SetSelectedRowsKernelOutput', code_indent,
             inplace_flag)
+        api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '')
         return f"""
 {code_indent}  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
 {code_indent}      "{self.kernel['func'][1]}", {{kernel_backend, kernel_layout, kernel_data_type}});
@@ -732,7 +737,10 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False):
 
 {code_indent}  using kernel_signature = {kernel_signature};
 {code_indent}  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-{code_indent}  (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  {{
+{code_indent}    paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1);
+{code_indent}    (*kernel_fn)({kernel_args}, {outputs_args});
+{code_indent}  }}
 
 {code_indent}  return {self.gene_return_code()};"""
 
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index a404fc0178415..cf9cb65f6d1f4 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -147,6 +147,9 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
+#include "paddle/phi/infermeta/ternary.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 """
 
 
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index c69bbf35b9726..ff5ebd6ef682c 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -25,6 +25,47 @@
   output : Tensor(x_grad)
   invoke : scale(out_grad, scale, bias, bias_after_scale)
 
+- backward_api : add_grad
+  forward : add (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : add_grad
+
+- backward_api : subtract_grad
+  forward : subtract (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : subtract_grad
+
+- backward_api : multiply_grad
+  forward : multiply (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : multiply_grad
+
+- backward_api : divide_grad
+  forward : divide (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : divide_grad
+
+
 - backward_api : digamma_grad
   forward : digamma (Tensor x) -> Tensor(out)
   args : (Tensor x, Tensor out_grad)
@@ -45,6 +86,16 @@
   kernel :
     func : abs_grad
 
+- backward_api : relu_grad
+  forward : relu (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : relu_grad
+
 - backward_api : trunc_grad
   forward : trunc (Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
@@ -57,7 +108,7 @@
 
 # - backward_api : norm_grad
 #   forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm)
-#   args : (Tensor out_grad, Tensor x, Tensor norm, int axis, float epsilon, bool is_test)
+#   args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test)
 #   output : Tensor(x_grad)
 #   infer_meta :
 #     func : UnchangedInferMeta
@@ -90,3 +141,181 @@
 #     func : MatmulTripleGradInferMeta
 #   kernel :
 #     func : matmul_triple_grad
+
+# - backward_api : gumbel_softmax_grad
+#   forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out)
+#   args : (Tensor out, Tensor out_grad, int axis)
+#   output : Tensor(x_grad)
+#   infer_meta :
+#     func : GumbelSoftmaxGradInferMeta
+#     param : [out, out_grad, axis]
+#   kernel :
+#     func : gumbel_softmax_grad
+
+
+- backward_api : transpose_grad
+  forward : transpose (Tensor x, int[] axis) -> Tensor(out)
+  args : (Tensor out_grad, int[] axis)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : TransposeGradInferMeta
+    param : [out_grad, axis]
+  kernel :
+    func : transpose_grad
+
+# - backward_api : lerp_grad
+#   forward : transpose (Tensor x, Tensor y, Tensor weight) -> Tensor(out)
+#   args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad)
+#   output : Tensor(x_grad), Tensor(y_grad)
+#   infer_meta :
+#     func : GeneralBinaryGradInferMeta
+#     param : [x, y]
+#   kernel :
+#     func : lerp_grad
+
+
+- backward_api : scatter_grad
+  forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out)
+  args : (Tensor index, Tensor updates, Tensor out_grad, bool overwrite)
+  output : Tensor(x_grad), Tensor(updates_grad)
+  infer_meta :
+    func : ScatterGradInferMeta
+    param : [index, updates, out_grad, overwrite]
+  kernel :
+    func : scatter_grad
+
+- backward_api : scatter_nd_add_grad
+  forward : scatter (Tensor x, Tensor index, Tensor updates) -> Tensor(out)
+  args : (Tensor index, Tensor updates, Tensor out_grad)
+  output : Tensor(x_grad), Tensor(updates_grad)
+  infer_meta :
+    func : ScatterNdAddGradInferMeta
+    param : [index, updates, out_grad]
+  kernel :
+    func : scatter_nd_grad
+
+- backward_api : addmm_grad
+  forward : scatter (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out)
+  args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta)
+  output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [input, x, y]
+  kernel :
+    func : addmm_grad
+
+- backward_api : where_grad
+  forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor condition, Tensor x, Tensor y, Tensor out_grad)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : where_grad
+
+# - backward_api : huber_loss_grad
+#   forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual)
+#   args : (Tensor residual, Tensor out_grad, float delta)
+#   output : Tensor(input_grad), Tensor(label_grad)
+#   infer_meta :
+#     func : GeneralBinaryGradInferMeta
+#     param : [x, y]
+#   kernel :
+#     func : where_grad
+
+# - backward_api : triangular_solve_grad
+#   forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out)
+#   args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular)
+#   output : Tensor(x_grad), Tensor(y_grad)
+#   infer_meta :
+#     func : GeneralBinaryGradInferMeta
+#     param : [x, y]
+#   kernel :
+#     func : triangular_solve_grad
+
+- backward_api : index_sample_grad
+  forward : index_sample (Tensor x, Tensor index) -> Tensor(out)
+  args : (Tensor x, Tensor index, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : index_sample_grad
+
+- backward_api : cross_grad
+  forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad, int axis)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : cross_grad
+
+- backward_api : atan2_grad
+  forward : cross (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : atan2_grad
+
+- backward_api : bce_loss_grad
+  forward : bce_loss (Tensor input, Tensor label) -> Tensor(out)
+  args : (Tensor input, Tensor label, Tensor out_grad)
+  output : Tensor(input_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [input]
+  kernel :
+    func : bce_loss_grad
+
+
+# - backward_api : dist_grad
+#   forward : dist (Tensor x, Tensor y, float p) -> Tensor(out)
+#   args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p)
+#   output : Tensor(x_grad), Tensor(y_grad)
+#   infer_meta :
+#     func : GeneralBinaryGradInferMeta
+#     param : [x, y]
+#   kernel :
+#     func : dist_grad
+
+
+
+- backward_api : gather_nd_grad
+  forward : gather_nd (Tensor x, Tensor index) -> Tensor(out)
+  args : (Tensor x, Tensor index, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : gather_nd_grad
+
+- backward_api : mv_grad
+  forward : mv (Tensor x, Tensor vec) -> Tensor(out)
+  args : (Tensor x, Tensor vec, Tensor out_grad)
+  output : Tensor(x_grad), Tensor(vec_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, vec]
+  kernel :
+    func : mv_grad
+
+
+
+#  =================================== sep0
+
+
+#  =================================== sep1
+
+
+#  =================================== sep2
+
+
+#  =================================== sep3
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
index 7417d6bb030da..5506f71f4b671 100644
--- a/python/paddle/utils/code_gen/backward_api_gen.py
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -154,6 +154,8 @@ def source_include(header_file_path):
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/infermeta/backward.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 """
 
 
diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml
index 9c859022e8ad1..2d1fe78b55981 100644
--- a/python/paddle/utils/code_gen/sparse_api.yaml
+++ b/python/paddle/utils/code_gen/sparse_api.yaml
@@ -4,18 +4,19 @@
   kernel :
     func : sparse_conv3d
     layout : x
+  backward : conv3d_grad
 
 - api : to_dense
-  args : (Tensor x, Backend backend)
+  args : (Tensor x)
   output : Tensor(out@DenseTensor)
-  invoke : to_dense_impl(x, backend)
+  invoke : to_dense_impl(x)
 
 - api : to_sparse_coo
-  args : (Tensor x, Backend backend, int64 sparse_dim)
+  args : (Tensor x, int64 sparse_dim)
   output : Tensor(out@SparseCooTensor)
-  invoke : to_sparse_coo_impl(x, backend, sparse_dim)
+  invoke : to_sparse_coo_impl(x, sparse_dim)
 
 - api : to_sparse_csr
-  args : (Tensor x, Backend backend)
+  args : (Tensor x)
   output : Tensor(out@SparseCsrTensor)
-  invoke : to_sparse_csr_impl(x, backend)
+  invoke : to_sparse_csr_impl(x)
diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py
index dd22e16dc64f0..b4fc7638622b9 100644
--- a/python/paddle/utils/code_gen/sparse_api_gen.py
+++ b/python/paddle/utils/code_gen/sparse_api_gen.py
@@ -192,9 +192,7 @@ def source_include(header_file_path):
 
 
 def api_register():
-    return """
-PD_REGISTER_API(Test);
-"""
+    return ""
 
 
 def api_namespace():
diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
index 561e198a41b99..5dac7c8c48367 100644
--- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py
+++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py
@@ -115,9 +115,7 @@ def source_include(header_file_path):
 
 
 def api_register():
-    return """
-PD_REGISTER_API(Test);
-"""
+    return ""
 
 
 def api_namespace():
diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
index 0d018f8e3f64f..aab4b219741a6 100644
--- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
+++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
@@ -43,6 +43,8 @@ def gene_wrapped_infermeta_and_register(api):
                 'const std::vector<Tensor>&': 'const std::vector<MetaTensor>&',
                 'Tensor': 'MetaTensor*',
                 'std::vector<Tensor>': 'std::vector<MetaTensor>*',
+                'const paddle::optional<Tensor&>':
+                'const paddle::optional<MetaTensor&>'
             }
 
             wrapped_infermeta_name = get_wrapped_infermeta_name(api.api)
@@ -98,6 +100,7 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/multiary.h"
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
+#include "paddle/phi/infermeta/ternary.h"
 """
 
 
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 1a3dbd68066a7..9fd200bf0344d 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -327,12 +327,17 @@ class ToTensor(BaseTransform):
             import paddle.vision.transforms as T
             import paddle.vision.transforms.functional as F
 
-            fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
+            fake_img = Image.fromarray((np.random.rand(4, 5, 3) * 255.).astype(np.uint8))
 
             transform = T.ToTensor()
 
             tensor = transform(fake_img)
-
+            
+            print(tensor.shape)
+            # [3, 4, 5]
+    
+            print(tensor.dtype)
+            # paddle.float32
     """
 
     def __init__(self, data_format='CHW', keys=None):
diff --git a/python/setup.py.in b/python/setup.py.in
index 689f63c0f00e9..2dbefb20bb6e6 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -282,6 +282,12 @@ packages=['paddle',
           'paddle.distribution',
           'paddle.distributed.sharding',
           'paddle.distributed.fleet',
+          'paddle.distributed.launch',
+          'paddle.distributed.launch.context',
+          'paddle.distributed.launch.controllers',
+          'paddle.distributed.launch.job',
+          'paddle.distributed.launch.plugins',
+          'paddle.distributed.launch.utils',
           'paddle.distributed.fleet.base',
           'paddle.distributed.fleet.elastic',
           'paddle.distributed.fleet.meta_optimizers',
@@ -300,6 +306,7 @@ packages=['paddle',
           'paddle.distributed.fleet.meta_parallel.parallel_layers',
           'paddle.distributed.auto_parallel',
           'paddle.distributed.auto_parallel.operators',
+          'paddle.distributed.auto_parallel.tuner',
           'paddle.distributed.passes',
           'paddle.framework',
           'paddle.jit',
@@ -726,7 +733,7 @@ with redirect_stdout():
         },
         entry_points={
             'console_scripts': [
-                'fleetrun = paddle.distributed.fleet.launch:launch'
+                'fleetrun = paddle.distributed.launch.main:launch'
             ]
         },
         classifiers=[
diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh
index 2a9fb842862c2..5466a1cdd597b 100644
--- a/tools/check_added_ut.sh
+++ b/tools/check_added_ut.sh
@@ -52,9 +52,10 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then
 elif [[ "$SYSTEM" == "Windows_NT" ]];then
     bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1
 fi
-ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut
+# remove line ended with .exe to get correct deleted_ut list
+ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/br-ut
 cd $PADDLE_ROOT/build
-ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut
+ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/pr-ut
 cd $PADDLE_ROOT
 grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut
 if [[ "$SYSTEM" == 'Linux' ]];then
@@ -66,6 +67,8 @@ rm -rf prec_build
 if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then
     rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh
 elif [[ "$SYSTEM" == "Windows_NT" ]];then
+    # get the deleted ut list in windows, will be used in check_change_of_unittest.sh
+    grep -F -x -v -f pr-ut br-ut > $PADDLE_ROOT/deleted_ut
     rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/win_cmake.sh
 fi
 git checkout -f $CURBRANCH
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 55d2d59c7ece6..d2892d13fc401 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -198,7 +198,9 @@ if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 6836917 47554610 22561442
 fi
 
-HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
+# infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future.
+NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true`
+HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true`
 if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n"
     check_approval 1 6836917 47554610 22561442
@@ -229,6 +231,12 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 6888866 39303645
   fi
 
+HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels/declarations.h" || true`
+if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n"
+    check_approval 1 22561442
+  fi
+
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
 if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td
index f754767259563..ae0316036f185 100644
--- a/tools/infrt/custom_pdop.td
+++ b/tools/infrt/custom_pdop.td
@@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> {
   let arguments = (ins PD_Tensor :$inputs, StrAttr:$name);
 }
 
-def PD_ReturnOp : PD_Op<"return", [Terminator]> {
-  let summary = "return Op";
-
-  let description = [{
-    Fetch tensor from the graph.
-  }];
-
-  let arguments = (ins Variadic<PD_Tensor>:$inputs);
-}
-
 def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> {
   let summary = "paddle graph Op";
   let description = [{
@@ -52,6 +42,6 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte
   let hasFolder = 1;
 
   let builders = [
-    OpBuilder<(ins "Attribute":$value)>,
+    OpBuilder<(ins "mlir::Attribute":$value)>,
   ];
 }
diff --git a/tools/infrt/fake_models/multi_fc.py b/tools/infrt/fake_models/multi_fc.py
index 0d633cfc60a9b..7149c8d022afd 100644
--- a/tools/infrt/fake_models/multi_fc.py
+++ b/tools/infrt/fake_models/multi_fc.py
@@ -52,4 +52,7 @@
 exe.run(fluid.default_startup_program())
 
 fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe)
+fluid.io.save_inference_model("./multi_fc_model", [a.name], [fc_out], exe, None,
+                              "fc.pdmodel", "fc.pdiparams")
+
 print('output name', fc_out.name)
diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
index 027dfe4328a55..b0e420da64aa2 100644
--- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
+++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py
@@ -16,8 +16,6 @@
 from paddle.fluid import core
 from paddle import compat as cpt
 
-ops_having_canonicalization = {"elementwise_add", }
-
 
 # collect original ops: op which has both inference and grid defination
 def get_original_ops():
@@ -186,16 +184,31 @@ def generate_all_ops_inputs_outputs_map(op_descs):
     cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};"
 
     # 3. Write to header file
-    dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h"
+    dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h"
     with open(dst_head_file, 'w') as ops_inputs_outputs_head_file:
         ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str)
         ops_inputs_outputs_head_file.write("\n\n")
         ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str)
 
 
+def get_constraint(op_type, op_proto):
+    # 2.3.1 inputs
+    constraint = "NoSideEffect"
+
+    optional_input_num_ = 0
+    for input_ in op_proto[INPUTS]:
+        if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][input_][
+                INTERMEDIATE] != True and op_proto[INPUTS][input_][
+                    DISPENSABLE] == True:
+            optional_input_num_ += 1
+    if optional_input_num_ > 1:
+        constraint += ", AttrSizedOperandSegments"
+    return constraint
+
+
 # funtion to generate paddle op dialect file
 def convert_op_proto_into_mlir(op_descs):
-    dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td"
+    dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td"
     custom_dialect_file = "custom_pdop.td"
 
     # 1. Head files
@@ -214,7 +227,7 @@ def convert_op_proto_into_mlir(op_descs):
         "include \"mlir/Interfaces/InferTypeOpInterface.td\"",
         "include \"mlir/Interfaces/LoopLikeInterface.td\"",
         "include \"mlir/IR/OpBase.td\"",
-        "include \"paddle/infrt/dialect/pd_op_base.td\"",
+        "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"",
         "",
     ]
 
@@ -239,13 +252,14 @@ def convert_op_proto_into_mlir(op_descs):
         if (op_type in skipped_op_list) or (op_type not in original_ops_):
             continue
         automatically_generated_op_dialect.append(op_type)
+        constraint_ = get_constraint(op_type, op_proto)
         # 2.1 OpDef
-        HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format(
+        HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [{constraint}]> {left_brace}\n'.format(
             op_type_capitalize=op_type.capitalize(),
+            constraint=constraint_,
             op_type=op_type,
             left_brace="{")
         SUMMARY = '  let summary = "{} op";\n'.format(op_type)
-        CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else ""
 
         # 2.2 Description
         contents = ""
@@ -259,14 +273,22 @@ def convert_op_proto_into_mlir(op_descs):
         ARGUMENTS = ""
         if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0):
             ARGUMENTS = "  let arguments = (ins "
+
             # 2.3.1 inputs
             for input_ in op_proto[INPUTS]:
                 if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][
                         input_][INTERMEDIATE] != True:
-                    if op_proto[INPUTS][input_][DUPLICABLE] != "true":
-                        ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + ","
+                    if op_proto[INPUTS][input_][DISPENSABLE] != True:
+                        if op_proto[INPUTS][input_][DUPLICABLE] != True:
+                            ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + ","
+                        else:
+                            ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + ","
                     else:
-                        ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + ","
+                        if op_proto[INPUTS][input_][DUPLICABLE] != True:
+                            ARGUMENTS = ARGUMENTS + " Optional<PD_Tensor>:$" + input_ + ","
+                        else:
+                            ARGUMENTS = ARGUMENTS + " Optional<PD_Tensor_Array>:$" + input_ + ","
+
             # unsupported:   BLOCK = 8;  BLOCKS = 10;
             attr_mlir_converter = {
                 0: 'SI32Attr',
@@ -335,7 +357,7 @@ def convert_op_proto_into_mlir(op_descs):
             for output_ in op_proto[OUTPUTS]:
                 if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[
                         OUTPUTS][output_][INTERMEDIATE] != True:
-                    if op_proto[OUTPUTS][output_][DUPLICABLE] != "true":
+                    if op_proto[OUTPUTS][output_][DUPLICABLE] != True:
                         outputs = outputs + "PD_Tensor:${},".format(output_)
                     else:
                         outputs = outputs + "PD_Tensor_Array:${},".format(
@@ -348,7 +370,6 @@ def convert_op_proto_into_mlir(op_descs):
             ops_mlir_file.write(DESCRIPTION)
             ops_mlir_file.write(ARGUMENTS)
             ops_mlir_file.write(RESULTS)
-            ops_mlir_file.write(CANONICALIZATION)
             ops_mlir_file.write("}\n")
 
     print("Skipped ops num: " + str(len(skipped_op_list)))
diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py
index 36561d4e71da8..f632c9a9dba50 100644
--- a/tools/infrt/generate_phi_kernel_dialect.py
+++ b/tools/infrt/generate_phi_kernel_dialect.py
@@ -43,7 +43,8 @@
     "float64": "FLOAT64",
     "complex64": "COMPLEX64",
     "complex128": "COMPLEX128",
-    "bool": "BOOL"
+    "bool": "BOOL",
+    "Undefined": "UNK"
 }
 
 kernel_types_info_file = "./kernels.json"
diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py
index b8c4232076c50..0680e87b38b3f 100644
--- a/tools/infrt/get_compat_kernel_signature.py
+++ b/tools/infrt/get_compat_kernel_signature.py
@@ -58,8 +58,9 @@ def get_compat_kernels_info():
                     content += line
                 if (registry and ";" in line):
                     data = content.replace("\n", "").replace(
-                        " ", "").strip("return").strip(
-                            "KernelSignature(").strip("\);").replace("\"", "")
+                        " ",
+                        "").strip("return").strip("KernelSignature(").strip(
+                            "\);").replace("\"", "").replace("\\", "")
                     registry = False
                     name, registry_info = parse_compat_registry(data)
 
diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh
index 3b9f4b7273500..febfe5d04762a 100644
--- a/tools/infrt/get_phi_kernel_function.sh
+++ b/tools/infrt/get_phi_kernel_function.sh
@@ -41,7 +41,37 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \
 grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc  \
   | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt
 
-#step 3: merge all infos
+
+#step 3:get ir's attr_name.
+ir_attr_name_info_file=`mktemp`
+# phi_cpu attr
+all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
+for ir in $all_ir_name
+do
+  attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td  | grep -Eo "Attr:.*)" \
+  | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
+  gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
+  gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
+  gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
+  gsub(/Attr/,"");gsub(/\)/,""); \
+  gsub(/[,:]/,"");print $a}'`
+  echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file
+done
+# phi_gpu attr
+all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'`
+for ir in $all_ir_name
+do
+  attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td  | grep -Eo "Attr:.*)" \
+  | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BoolAttr/,""); \
+  gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \
+  gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \
+  gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \
+  gsub(/Attr/,"");gsub(/\)/,""); \
+  gsub(/[,:]/,"");print $a}'`
+  echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file
+done
+
+#step 4: merge all infos
 #  @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout)
 #  @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name 
 #  @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has
@@ -50,4 +80,5 @@ python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \
   --paddle_root_path ${PADDLE_ROOT} \
   --kernel_info_file $kernel_register_info_file \
   --infermeta_wrap_file ${temp_path}/wrap_info.txt \
+  --attr_info_file $ir_attr_name_info_file \
   --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py
index 774f6cd6bf364..8b752f928719b 100644
--- a/tools/infrt/get_phi_kernel_info.py
+++ b/tools/infrt/get_phi_kernel_info.py
@@ -37,6 +37,8 @@ def parse_args():
         type=str,
         required=True,
         help="inferMeta wrap info file.")
+    parser.add_argument(
+        "--attr_info_file", type=str, required=True, help="attr info file.")
     parser.add_argument(
         "--generate_file",
         type=str,
@@ -59,6 +61,23 @@ def get_kernel_info(file_path):
     return [l.strip() for l in cont]
 
 
+def get_attr_info(file_path):
+    """
+    phi_gpu.argsort.float64.any $axisBool$descending
+    """
+    ret = {}
+    with open(file_path, 'r') as f:
+        cont = f.readlines()
+        for l in cont:
+            datas = l.strip().split(' ')
+            if len(datas) == 2:
+                attrs = datas[1].split('$')
+                ret[datas[0]] = attrs[1:]
+            else:
+                ret[datas[0]] = None
+    return ret
+
+
 def merge(infer_meta_data, kernel_data, wrap_data):
     meta_map = {}
     for api in infer_meta_data:
@@ -114,14 +133,14 @@ def gen_namespace():
 
 def gen_context(val):
     if val == "CPU":
-        return "phi::CPUContext"
-    # elif val == "GPU":
-    #     return "phi::GPUContext"
+        return "::phi::CPUContext", "phi_cpu"
+    elif val == "GPU":
+        return "::phi::GPUContext", "phi_gpu"
     # elif val == "XPU":
-    #     return "phi::XPUContext"
+    #     return "::phi::XPUContext", "phi_xpu"
     else:
         # raise Exception(f"Unknown context type {val}")
-        return ""
+        return "", ""
 
 
 def gen_layout(val):
@@ -138,12 +157,12 @@ def gen_kernel_func(val, ctx_name, dtype_name):
         ed = val.index('>')
         func_name = val[:st]
         template_name = val[st + 1:ed]
-        if 'phi::' in template_name:
-            return "&phi::" + val
+        if '::phi::' in template_name:
+            return "&::phi::" + val
         else:
-            return "&phi::" + func_name + "<phi::" + template_name + ">"
+            return "&::phi::" + func_name + "<::phi::" + template_name + ">"
     else:
-        return "&phi::" + val + "<" + dtype_name + ", " + ctx_name + ">"
+        return "&::phi::" + val + "<" + dtype_name + ", " + ctx_name + ">"
 
 
 def gen_dtype(vals: List[str]):
@@ -195,34 +214,53 @@ def gen_dtype(vals: List[str]):
     return ir_dtypes, origin_dtypes
 
 
-# TODO(wilber): Now only process CPUContext.
-def gen_register_info(resources: List[List[str]]):
+# Note: Now only process CPUContext and GPUContext.
+
+
+def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]):
     """
-    resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...]
+    item: ['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta']
+    attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']}
     """
-    res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {"
-    for item in resources:
-        # The output string is polluted by C++ macros, here the \ is removed
-        update_item = [v.strip('\\') for v in item]
+    ctx_name, ir_ctx_name = gen_context(item[1])
+    if (ctx_name == ""):
+        return ""
+    item[2] = gen_layout(item[2])
+    ir_dtypes, origin_dtypes = gen_dtype(item[4:-1])
+    infer_shape_func = "&::phi::" + item[-1]
 
-        ctx_name = gen_context(update_item[1])
-        if (ctx_name == ""):
-            continue
-        update_item[2] = gen_layout(update_item[2])
-        ir_dtypes, origin_dtypes = gen_dtype(update_item[4:-1])
-        infer_shape_func = "&phi::" + update_item[-1]
+    res = ""
 
-        if update_item[-1] == "unknown":
-            # TODO(wilber): handle the unknown inferShape func.
-            continue
+    if item[-1] == "unknown":
+        # TODO(wilber): handle the unknown inferShape func.
+        return ""
+
+    for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
+        kernel_func = gen_kernel_func(item[3], ctx_name, origin_dtype)
+        ir_name = ir_ctx_name + '.' + item[0].lower(
+        ) + '.' + ir_dtype + '.' + item[2].lower()
+        if ir_name in attr_data.keys() and attr_data[ir_name] is not None:
+            attr_names = ', '.join(
+                ["\"" + a + "\"" for a in attr_data[ir_name]])
+            res += f"""
+registry->AddKernelWithAttrs("{ir_name}","""
+
+            res += f"""
+    std::bind(&KernelLauncherFunc<decltype({kernel_func}),
+                                  {kernel_func},
+                                  decltype({infer_shape_func}),
+                                  {infer_shape_func}>,
+              KernelLauncher<decltype({kernel_func}),
+                                  {kernel_func},
+                                  decltype({infer_shape_func}),
+                                  {infer_shape_func}>(),
+              std::placeholders::_1),
+    {{{attr_names}}});
+"""
 
-        for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes):
-            kernel_func = gen_kernel_func(update_item[3], ctx_name,
-                                          origin_dtype)
-            ir_name = 'phi_cpu.' + update_item[0].lower(
-            ) + '.' + ir_dtype + '.' + update_item[2].lower()
+        else:
             res += f"""
-  registry->AddKernel("{ir_name}","""
+registry->AddKernel("{ir_name}","""
 
             res += f"""
     std::bind(&KernelLauncherFunc<decltype({kernel_func}),
@@ -236,18 +274,54 @@ def gen_register_info(resources: List[List[str]]):
               std::placeholders::_1));
 """
 
+    return res
+
+
+def gen_register_info(resources: List[List[str]],
+                      attr_data: Dict[str, List[str]]):
+    """
+    resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...]
+    attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']}
+    """
+    res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {"
+
+    # register cpu kernels.
+    for item in resources:
+        # The output string is polluted by C++ macros, here the \ is removed
+        update_item = [v.strip('\\') for v in item]
+        if update_item[1] != "CPU":
+            continue
+        code = gen_register_code_info(item, attr_data)
+        if (code == ""):
+            continue
+        res += code
+
+    # register gpu kernels.
+    res += "\n#ifdef INFRT_WITH_GPU"
+    for item in resources:
+        # The output string is polluted by C++ macros, here the \ is removed
+        update_item = [v.strip('\\') for v in item]
+        if update_item[1] != "GPU":
+            continue
+        code = gen_register_code_info(item, attr_data)
+        if (code == ""):
+            continue
+        res += code
+    res += "#endif // INFRT_WITH_GPU"
+
     res += "\n}"
     return res
 
 
 def gen_phi_kernel_register_code(resources: List[List[str]],
+                                 attr_data: Dict[str, List[str]],
                                  src_file_path: str):
     source_file = open(src_file_path, 'w')
     source_file.write(gen_warn_info())
     source_file.write(gen_include_headers())
     namespace = gen_namespace()
     source_file.write(namespace[0])
-    source_file.write(gen_register_info(resources))
+    source_file.write(gen_register_info(resources, attr_data))
     source_file.write(namespace[1])
     source_file.close()
 
@@ -257,5 +331,6 @@ def gen_phi_kernel_register_code(resources: List[List[str]],
     infer_meta_data = get_api_yaml_info(args.paddle_root_path)
     kernel_data = get_kernel_info(args.kernel_info_file)
     info_meta_wrap_data = get_kernel_info(args.infermeta_wrap_file)
+    attr_data = get_attr_info(args.attr_info_file)
     out = merge(infer_meta_data, kernel_data, info_meta_wrap_data)
-    gen_phi_kernel_register_code(out, args.generate_file)
+    gen_phi_kernel_register_code(out, attr_data, args.generate_file)
diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh
new file mode 100644
index 0000000000000..576f0e5d238ab
--- /dev/null
+++ b/tools/windows/check_change_of_unittest.sh
@@ -0,0 +1,41 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set +x
+export PADDLE_ROOT="$(cd "$PWD/../" && pwd )"
+GITHUB_API_TOKEN=$GITHUB_API_TOKEN
+GIT_PR_ID=$AGILE_PULL_ID
+BRANCH=$BRANCH
+if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then
+    exit 0 
+fi
+
+unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g')
+if [ "$unittest_spec_diff" != "" ]; then
+    approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
+    APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955`
+    echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+    if [ "${APPROVALS}" == "FALSE" ]; then
+        echo "************************************"
+        echo -e "It is forbidden to disable or delete the unit-test.\n"
+        echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]."
+        echo -e "Then you must have one RD (kolinwei(recommended), chalsliu, XieYunshen or zhouwei25) approval for the deletion of unit-test. \n"
+        echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n"
+        echo -e "Following unit-tests are deleted in this PR: \n${unittest_spec_diff} \n"
+        echo "************************************"
+        exit 6
+    fi
+fi
+set -x